diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,78466 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0036764705882355, + "eval_steps": 500, + "global_step": 2451, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 267.484375, + "completions/mean_terminated_length": 267.484375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.145115926861763, + "epoch": 0.0012254901960784314, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2699338046500006, + "kl": 0.0, + "learning_rate": 0.0, + "loss": -0.0224, + "num_tokens": 32911.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.8768293857574463, + "sampling/importance_sampling_ratio/mean": 1.0000336170196533, + "sampling/importance_sampling_ratio/min": 0.4881851375102997, + "sampling/sampling_logp_difference/max": 0.7170605659484863, + "sampling/sampling_logp_difference/mean": 0.011007876135408878, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 184.265625, + "completions/mean_terminated_length": 184.265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.17698192596435547, + "epoch": 0.0024509803921568627, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.279908744405416, + "kl": 0.0, + "learning_rate": 4.065040650406504e-09, + "loss": -0.0334, + "num_tokens": 60896.0, + "reward": -0.0625, + "reward_std": 0.644389271736145, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992719888687134, + "sampling/importance_sampling_ratio/min": 0.14117345213890076, + "sampling/sampling_logp_difference/max": 1.9577659368515015, + "sampling/sampling_logp_difference/mean": 0.015893325209617615, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 210.71875, + "completions/mean_terminated_length": 210.71875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.21332260966300964, + "epoch": 0.003676470588235294, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.5294807730188364, + "kl": 0.0008067585877142847, + "learning_rate": 8.130081300813008e-09, + "loss": -0.0791, + "num_tokens": 92574.0, + "reward": 0.3125, + "reward_std": 0.5501632690429688, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.7787456512451172, + "sampling/importance_sampling_ratio/mean": 0.9998263120651245, + "sampling/importance_sampling_ratio/min": 0.28856924176216125, + "sampling/sampling_logp_difference/max": 1.2428202629089355, + "sampling/sampling_logp_difference/mean": 0.016139939427375793, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 264.4375, + "completions/mean_terminated_length": 264.4375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.1841556429862976, + "epoch": 0.004901960784313725, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9956682594403414, + "kl": 0.0007476672763004899, + "learning_rate": 1.2195121951219512e-08, + "loss": -0.1092, + "num_tokens": 128282.0, + "reward": 0.375, + "reward_std": 0.7236068248748779, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002386569976807, + "sampling/importance_sampling_ratio/min": 0.3053293526172638, + "sampling/sampling_logp_difference/max": 1.1863641738891602, + "sampling/sampling_logp_difference/mean": 0.0141455614939332, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 152.421875, + "completions/mean_terminated_length": 152.421875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2214486002922058, + "epoch": 0.006127450980392157, + "frac_reward_zero_std": 0.25, + "grad_norm": 4.369864432046693, + "kl": 0.001004523248411715, + "learning_rate": 1.6260162601626016e-08, + "loss": 0.0366, + "num_tokens": 163285.0, + "reward": 0.25, + "reward_std": 0.6285127401351929, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9981650114059448, + "sampling/importance_sampling_ratio/min": 0.3985414505004883, + "sampling/sampling_logp_difference/max": 0.9357107877731323, + "sampling/sampling_logp_difference/mean": 0.020896129310131073, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 166.28125, + "completions/mean_terminated_length": 166.28125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.23074975609779358, + "epoch": 0.007352941176470588, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3140508936905912, + "kl": 0.0009867888875305653, + "learning_rate": 2.032520325203252e-08, + "loss": 0.0136, + "num_tokens": 189351.0, + "reward": -0.34375, + "reward_std": 0.8705305457115173, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.956235408782959, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 0.39710769057273865, + "sampling/sampling_logp_difference/max": 0.9235477447509766, + "sampling/sampling_logp_difference/mean": 0.020348988473415375, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 198.0, + "completions/mean_terminated_length": 198.0, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.1819380521774292, + "epoch": 0.00857843137254902, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.4418243651013998, + "kl": 0.0008182962192222476, + "learning_rate": 2.4390243902439023e-08, + "loss": -0.0953, + "num_tokens": 220759.0, + "reward": 0.40625, + "reward_std": 0.5959457159042358, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.871976613998413, + "sampling/importance_sampling_ratio/mean": 0.9996786713600159, + "sampling/importance_sampling_ratio/min": 0.31916162371635437, + "sampling/sampling_logp_difference/max": 1.1420576572418213, + "sampling/sampling_logp_difference/mean": 0.014583440497517586, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 192.546875, + "completions/mean_terminated_length": 192.546875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.1745167374610901, + "epoch": 0.00980392156862745, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5788609427873816, + "kl": 0.0007142078247852623, + "learning_rate": 2.8455284552845527e-08, + "loss": -0.0171, + "num_tokens": 250730.0, + "reward": 0.34375, + "reward_std": 0.5809217691421509, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.574067234992981, + "sampling/importance_sampling_ratio/mean": 1.0004377365112305, + "sampling/importance_sampling_ratio/min": 0.5096371173858643, + "sampling/sampling_logp_difference/max": 0.6740564107894897, + "sampling/sampling_logp_difference/mean": 0.012199976481497288, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 187.140625, + "completions/mean_terminated_length": 187.140625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.1829577535390854, + "epoch": 0.011029411764705883, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.8171989512948765, + "kl": 0.0006950580282136798, + "learning_rate": 3.252032520325203e-08, + "loss": -0.164, + "num_tokens": 284835.0, + "reward": 0.21875, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001541376113892, + "sampling/importance_sampling_ratio/min": 0.25570473074913025, + "sampling/sampling_logp_difference/max": 1.363731861114502, + "sampling/sampling_logp_difference/mean": 0.014619017019867897, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 199.390625, + "completions/mean_terminated_length": 199.390625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.21946877241134644, + "epoch": 0.012254901960784314, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.743495906786532, + "kl": 0.0010361624881625175, + "learning_rate": 3.658536585365853e-08, + "loss": -0.0196, + "num_tokens": 318204.0, + "reward": 0.25, + "reward_std": 0.6613117456436157, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999617338180542, + "sampling/importance_sampling_ratio/min": 0.40798985958099365, + "sampling/sampling_logp_difference/max": 0.8965129852294922, + "sampling/sampling_logp_difference/mean": 0.017877453938126564, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 172.296875, + "completions/mean_terminated_length": 172.296875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.21682016551494598, + "epoch": 0.013480392156862746, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.7792153496967664, + "kl": 0.0012470419751480222, + "learning_rate": 4.065040650406504e-08, + "loss": -0.0067, + "num_tokens": 345983.0, + "reward": 0.09375, + "reward_std": 0.686570405960083, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997990131378174, + "sampling/importance_sampling_ratio/min": 0.15214312076568604, + "sampling/sampling_logp_difference/max": 1.8829336166381836, + "sampling/sampling_logp_difference/mean": 0.018758177757263184, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 186.734375, + "completions/mean_terminated_length": 186.734375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.18953701853752136, + "epoch": 0.014705882352941176, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.035123873225246, + "kl": 0.0011587527114897966, + "learning_rate": 4.4715447154471546e-08, + "loss": 0.0796, + "num_tokens": 374830.0, + "reward": 0.59375, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007586479187012, + "sampling/importance_sampling_ratio/min": 0.3381035625934601, + "sampling/sampling_logp_difference/max": 1.0844030380249023, + "sampling/sampling_logp_difference/mean": 0.014523649588227272, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 172.078125, + "completions/mean_terminated_length": 172.078125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.1656114161014557, + "epoch": 0.015931372549019607, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.417069846130118, + "kl": 0.000998029951006174, + "learning_rate": 4.878048780487805e-08, + "loss": 0.0026, + "num_tokens": 400211.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995077848434448, + "sampling/importance_sampling_ratio/min": 0.2681380808353424, + "sampling/sampling_logp_difference/max": 1.3162531852722168, + "sampling/sampling_logp_difference/mean": 0.01516179833561182, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 176.3125, + "completions/mean_terminated_length": 176.3125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.16202491521835327, + "epoch": 0.01715686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.213437710938307, + "kl": 0.0008014775812625885, + "learning_rate": 5.2845528455284554e-08, + "loss": -0.0109, + "num_tokens": 426503.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.8623807430267334, + "sampling/importance_sampling_ratio/mean": 0.9993404746055603, + "sampling/importance_sampling_ratio/min": 0.0011495015351101756, + "sampling/sampling_logp_difference/max": 6.768426895141602, + "sampling/sampling_logp_difference/mean": 0.01480356976389885, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 147.390625, + "completions/mean_terminated_length": 147.390625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.12848389148712158, + "epoch": 0.01838235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008026646180956696, + "kl": 0.000677362666465342, + "learning_rate": 5.6910569105691055e-08, + "loss": 0.0, + "num_tokens": 453488.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6391452550888062, + "sampling/importance_sampling_ratio/mean": 0.9992384910583496, + "sampling/importance_sampling_ratio/min": 0.31635162234306335, + "sampling/sampling_logp_difference/max": 1.150900959968567, + "sampling/sampling_logp_difference/mean": 0.011211428791284561, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 249.453125, + "completions/mean_terminated_length": 249.453125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.23030908405780792, + "epoch": 0.0196078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4556318068591032, + "kl": 0.0008972191135399044, + "learning_rate": 6.097560975609756e-08, + "loss": 0.0407, + "num_tokens": 495101.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994782209396362, + "sampling/importance_sampling_ratio/min": 0.3743397891521454, + "sampling/sampling_logp_difference/max": 0.9825913906097412, + "sampling/sampling_logp_difference/mean": 0.017919588834047318, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 155.734375, + "completions/mean_terminated_length": 155.734375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.18947651982307434, + "epoch": 0.020833333333333332, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.904796813624881, + "kl": 0.0007599001983180642, + "learning_rate": 6.504065040650406e-08, + "loss": 0.0041, + "num_tokens": 521468.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.9906654357910156, + "sampling/importance_sampling_ratio/mean": 0.9998794794082642, + "sampling/importance_sampling_ratio/min": 0.4074186682701111, + "sampling/sampling_logp_difference/max": 0.897913932800293, + "sampling/sampling_logp_difference/mean": 0.015055290423333645, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 188.46875, + "completions/mean_terminated_length": 188.46875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.18024782836437225, + "epoch": 0.022058823529411766, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5232328359620357, + "kl": 0.0013627070002257824, + "learning_rate": 6.910569105691057e-08, + "loss": -0.0298, + "num_tokens": 548234.0, + "reward": 0.25, + "reward_std": 0.6494960784912109, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007586479187012, + "sampling/importance_sampling_ratio/min": 0.09287115931510925, + "sampling/sampling_logp_difference/max": 2.376542091369629, + "sampling/sampling_logp_difference/mean": 0.016794255003333092, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 220.34375, + "completions/mean_terminated_length": 220.34375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.20761245489120483, + "epoch": 0.023284313725490197, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2247091470788702, + "kl": 0.0009089668747037649, + "learning_rate": 7.317073170731706e-08, + "loss": -0.0216, + "num_tokens": 582880.0, + "reward": 0.8125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.8837299346923828, + "sampling/importance_sampling_ratio/mean": 0.9993168115615845, + "sampling/importance_sampling_ratio/min": 0.28969964385032654, + "sampling/sampling_logp_difference/max": 1.2389106750488281, + "sampling/sampling_logp_difference/mean": 0.016906775534152985, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 167.796875, + "completions/mean_terminated_length": 167.796875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.1678135246038437, + "epoch": 0.024509803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2937421687772095, + "kl": 0.0010125580010935664, + "learning_rate": 7.723577235772358e-08, + "loss": -0.0078, + "num_tokens": 613267.0, + "reward": 0.40625, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992740750312805, + "sampling/importance_sampling_ratio/min": 0.22527435421943665, + "sampling/sampling_logp_difference/max": 1.490436315536499, + "sampling/sampling_logp_difference/mean": 0.01392899826169014, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 176.875, + "completions/mean_terminated_length": 176.875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.19384238123893738, + "epoch": 0.025735294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.324882947640868, + "kl": 0.0007086207042448223, + "learning_rate": 8.130081300813008e-08, + "loss": 0.007, + "num_tokens": 642203.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 0.5038077235221863, + "sampling/sampling_logp_difference/max": 0.8750619888305664, + "sampling/sampling_logp_difference/mean": 0.013608230277895927, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 175.171875, + "completions/mean_terminated_length": 175.171875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.17411136627197266, + "epoch": 0.02696078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4113258242602515, + "kl": 0.0009255572222173214, + "learning_rate": 8.536585365853659e-08, + "loss": 0.0121, + "num_tokens": 673318.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998393058776855, + "sampling/importance_sampling_ratio/min": 0.5363104939460754, + "sampling/sampling_logp_difference/max": 0.7736172676086426, + "sampling/sampling_logp_difference/mean": 0.013956751674413681, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 211.4375, + "completions/mean_terminated_length": 211.4375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2113991677761078, + "epoch": 0.028186274509803922, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2963084099370983, + "kl": 0.001079519628547132, + "learning_rate": 8.943089430894309e-08, + "loss": -0.0252, + "num_tokens": 706850.0, + "reward": 0.28125, + "reward_std": 0.7561737298965454, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994499683380127, + "sampling/importance_sampling_ratio/min": 0.3145255744457245, + "sampling/sampling_logp_difference/max": 1.1566898822784424, + "sampling/sampling_logp_difference/mean": 0.016110863536596298, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 178.125, + "completions/mean_terminated_length": 178.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.18932907283306122, + "epoch": 0.029411764705882353, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.4792697662005376, + "kl": 0.0008806021651253104, + "learning_rate": 9.349593495934959e-08, + "loss": 0.0367, + "num_tokens": 734266.0, + "reward": 0.6875, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997655153274536, + "sampling/importance_sampling_ratio/min": 0.37791651487350464, + "sampling/sampling_logp_difference/max": 0.9730819463729858, + "sampling/sampling_logp_difference/mean": 0.015410843305289745, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 218.546875, + "completions/mean_terminated_length": 218.546875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.256826788187027, + "epoch": 0.030637254901960783, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.078362603626218, + "kl": 0.0011094619985669851, + "learning_rate": 9.75609756097561e-08, + "loss": 0.0062, + "num_tokens": 768893.0, + "reward": -0.625, + "reward_std": 0.481805682182312, + "rewards/decision_reward_func/mean": -0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6925945281982422, + "sampling/importance_sampling_ratio/mean": 0.9995798468589783, + "sampling/importance_sampling_ratio/min": 0.3771505653858185, + "sampling/sampling_logp_difference/max": 0.9751107692718506, + "sampling/sampling_logp_difference/mean": 0.018611162900924683, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 201.328125, + "completions/mean_terminated_length": 201.328125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.1686311811208725, + "epoch": 0.031862745098039214, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7713450102289707, + "kl": 0.0007161884568631649, + "learning_rate": 1.016260162601626e-07, + "loss": -0.0072, + "num_tokens": 798258.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5744632482528687, + "sampling/importance_sampling_ratio/mean": 1.0003325939178467, + "sampling/importance_sampling_ratio/min": 0.5094258785247803, + "sampling/sampling_logp_difference/max": 0.6744709014892578, + "sampling/sampling_logp_difference/mean": 0.012171566486358643, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 159.234375, + "completions/mean_terminated_length": 159.234375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.18077212572097778, + "epoch": 0.03308823529411765, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.331492715904185, + "kl": 0.001058193389326334, + "learning_rate": 1.0569105691056911e-07, + "loss": -0.0145, + "num_tokens": 830337.0, + "reward": 0.71875, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7081413269042969, + "sampling/importance_sampling_ratio/mean": 0.9999635219573975, + "sampling/importance_sampling_ratio/min": 0.6173048615455627, + "sampling/sampling_logp_difference/max": 0.5354058742523193, + "sampling/sampling_logp_difference/mean": 0.015099374577403069, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 176.4375, + "completions/mean_terminated_length": 176.4375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.23961573839187622, + "epoch": 0.03431372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.416952272772911, + "kl": 0.001167132519185543, + "learning_rate": 1.097560975609756e-07, + "loss": 0.0155, + "num_tokens": 858221.0, + "reward": 0.1875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.9112063646316528, + "sampling/importance_sampling_ratio/mean": 0.9995735883712769, + "sampling/importance_sampling_ratio/min": 0.4739725589752197, + "sampling/sampling_logp_difference/max": 0.7466058731079102, + "sampling/sampling_logp_difference/mean": 0.01782134920358658, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 162.109375, + "completions/mean_terminated_length": 162.109375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2027626484632492, + "epoch": 0.03553921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1486452460862857, + "kl": 0.0009120207978412509, + "learning_rate": 1.1382113821138211e-07, + "loss": 0.0393, + "num_tokens": 886804.0, + "reward": 0.5625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.608196496963501, + "sampling/importance_sampling_ratio/mean": 0.9993435144424438, + "sampling/importance_sampling_ratio/min": 0.4156869351863861, + "sampling/sampling_logp_difference/max": 0.8778228759765625, + "sampling/sampling_logp_difference/mean": 0.016218479722738266, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 175.296875, + "completions/mean_terminated_length": 175.296875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.20189282298088074, + "epoch": 0.03676470588235294, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.42404206856289, + "kl": 0.0011729395482689142, + "learning_rate": 1.1788617886178862e-07, + "loss": -0.0321, + "num_tokens": 911463.0, + "reward": 0.09375, + "reward_std": 0.8273203372955322, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996466636657715, + "sampling/importance_sampling_ratio/min": 0.24144580960273743, + "sampling/sampling_logp_difference/max": 1.4211102724075317, + "sampling/sampling_logp_difference/mean": 0.01663786731660366, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 242.421875, + "completions/mean_terminated_length": 242.421875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.1681455820798874, + "epoch": 0.03799019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7403563125874317, + "kl": 0.009450777433812618, + "learning_rate": 1.219512195121951e-07, + "loss": 0.0433, + "num_tokens": 941970.0, + "reward": 0.71875, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996355772018433, + "sampling/importance_sampling_ratio/min": 0.008333723060786724, + "sampling/sampling_logp_difference/max": 4.787445068359375, + "sampling/sampling_logp_difference/mean": 0.015643514692783356, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 160.578125, + "completions/mean_terminated_length": 160.578125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.14362278580665588, + "epoch": 0.0392156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5064949900405952, + "kl": 0.0027769720181822777, + "learning_rate": 1.260162601626016e-07, + "loss": 0.0, + "num_tokens": 977127.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999289512634277, + "sampling/importance_sampling_ratio/min": 0.049872394651174545, + "sampling/sampling_logp_difference/max": 2.9982876777648926, + "sampling/sampling_logp_difference/mean": 0.013318167068064213, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 237.484375, + "completions/mean_terminated_length": 237.484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.17521731555461884, + "epoch": 0.04044117647058824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008297787487699215, + "kl": 0.0007624527206644416, + "learning_rate": 1.3008130081300813e-07, + "loss": 0.0, + "num_tokens": 1011142.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6099501848220825, + "sampling/importance_sampling_ratio/mean": 0.9996030926704407, + "sampling/importance_sampling_ratio/min": 0.3408905267715454, + "sampling/sampling_logp_difference/max": 1.076193928718567, + "sampling/sampling_logp_difference/mean": 0.012307427823543549, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 222.578125, + "completions/mean_terminated_length": 222.578125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.20559880137443542, + "epoch": 0.041666666666666664, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.013657529944585, + "kl": 0.000846860115416348, + "learning_rate": 1.3414634146341465e-07, + "loss": 0.0086, + "num_tokens": 1045355.0, + "reward": 0.0, + "reward_std": 0.6143567562103271, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8357250690460205, + "sampling/importance_sampling_ratio/mean": 0.9997804164886475, + "sampling/importance_sampling_ratio/min": 0.2929272949695587, + "sampling/sampling_logp_difference/max": 1.2278308868408203, + "sampling/sampling_logp_difference/mean": 0.014453263953328133, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 169.34375, + "completions/mean_terminated_length": 169.34375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.19829398393630981, + "epoch": 0.0428921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.505141941859476, + "kl": 0.0011524150613695383, + "learning_rate": 1.3821138211382114e-07, + "loss": 0.0141, + "num_tokens": 1075969.0, + "reward": -0.15625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999667763710022, + "sampling/importance_sampling_ratio/min": 0.22803999483585358, + "sampling/sampling_logp_difference/max": 1.4782342910766602, + "sampling/sampling_logp_difference/mean": 0.016655966639518738, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 205.53125, + "completions/mean_terminated_length": 205.53125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.19215545058250427, + "epoch": 0.04411764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4767662046376377, + "kl": 0.0009019374847412109, + "learning_rate": 1.4227642276422763e-07, + "loss": 0.0085, + "num_tokens": 1109795.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.8890724182128906, + "sampling/importance_sampling_ratio/mean": 1.0002727508544922, + "sampling/importance_sampling_ratio/min": 0.37968289852142334, + "sampling/sampling_logp_difference/max": 0.9684188365936279, + "sampling/sampling_logp_difference/mean": 0.01590348407626152, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 235.0, + "completions/mean_terminated_length": 235.0, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.270089328289032, + "epoch": 0.04534313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3259173987424338, + "kl": 0.0008042281260713935, + "learning_rate": 1.4634146341463413e-07, + "loss": -0.0085, + "num_tokens": 1143731.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002659559249878, + "sampling/importance_sampling_ratio/min": 0.5362957715988159, + "sampling/sampling_logp_difference/max": 0.9607458114624023, + "sampling/sampling_logp_difference/mean": 0.017081955447793007, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 165.03125, + "completions/mean_terminated_length": 165.03125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.20545266568660736, + "epoch": 0.04656862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5826552693603972, + "kl": 0.0007720981957390904, + "learning_rate": 1.5040650406504065e-07, + "loss": -0.0142, + "num_tokens": 1175077.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994447231292725, + "sampling/importance_sampling_ratio/min": 0.459695965051651, + "sampling/sampling_logp_difference/max": 0.7771899700164795, + "sampling/sampling_logp_difference/mean": 0.015556670725345612, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 229.265625, + "completions/mean_terminated_length": 229.265625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2712585926055908, + "epoch": 0.04779411764705882, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.353506040765074, + "kl": 0.0009125759825110435, + "learning_rate": 1.5447154471544717e-07, + "loss": -0.0731, + "num_tokens": 1204102.0, + "reward": 0.0625, + "reward_std": 0.6663130521774292, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7431355714797974, + "sampling/importance_sampling_ratio/mean": 1.000112771987915, + "sampling/importance_sampling_ratio/min": 0.3292010724544525, + "sampling/sampling_logp_difference/max": 1.1110866069793701, + "sampling/sampling_logp_difference/mean": 0.019008290022611618, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 162.796875, + "completions/mean_terminated_length": 162.796875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.20898710191249847, + "epoch": 0.049019607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.6237440741233096, + "kl": 0.0029189821798354387, + "learning_rate": 1.5853658536585366e-07, + "loss": -0.0096, + "num_tokens": 1232585.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992484450340271, + "sampling/importance_sampling_ratio/min": 0.04195699468255043, + "sampling/sampling_logp_difference/max": 3.171110153198242, + "sampling/sampling_logp_difference/mean": 0.0167709868401289, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 205.0625, + "completions/mean_terminated_length": 205.0625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.2982996702194214, + "epoch": 0.05024509803921569, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.799568407655128, + "kl": 0.0009631913271732628, + "learning_rate": 1.6260162601626016e-07, + "loss": 0.0097, + "num_tokens": 1261949.0, + "reward": -0.21875, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994151592254639, + "sampling/importance_sampling_ratio/min": 0.5006861686706543, + "sampling/sampling_logp_difference/max": 0.8760700225830078, + "sampling/sampling_logp_difference/mean": 0.01940348744392395, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 183.546875, + "completions/mean_terminated_length": 183.546875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.15424934029579163, + "epoch": 0.051470588235294115, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7897617291631476, + "kl": 0.0008602460147812963, + "learning_rate": 1.6666666666666665e-07, + "loss": -0.0108, + "num_tokens": 1291360.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0012483596801758, + "sampling/importance_sampling_ratio/min": 0.3941444158554077, + "sampling/sampling_logp_difference/max": 0.9310379028320312, + "sampling/sampling_logp_difference/mean": 0.013946986757218838, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 150.640625, + "completions/mean_terminated_length": 150.640625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.14799076318740845, + "epoch": 0.05269607843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.3545311634479824, + "kl": 0.001798890414647758, + "learning_rate": 1.7073170731707317e-07, + "loss": 0.0144, + "num_tokens": 1315833.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005621910095215, + "sampling/importance_sampling_ratio/min": 0.0856546014547348, + "sampling/sampling_logp_difference/max": 2.457432270050049, + "sampling/sampling_logp_difference/mean": 0.01602208986878395, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 191.59375, + "completions/mean_terminated_length": 191.59375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.21230532228946686, + "epoch": 0.05392156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.311298023398624, + "kl": 0.0005751050775870681, + "learning_rate": 1.7479674796747966e-07, + "loss": -0.0063, + "num_tokens": 1346527.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7553319931030273, + "sampling/importance_sampling_ratio/mean": 0.9997537136077881, + "sampling/importance_sampling_ratio/min": 0.4791052043437958, + "sampling/sampling_logp_difference/max": 0.735835075378418, + "sampling/sampling_logp_difference/mean": 0.013450969010591507, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 206.921875, + "completions/mean_terminated_length": 206.921875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2584640681743622, + "epoch": 0.05514705882352941, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.588613093482011, + "kl": 0.0011465022107586265, + "learning_rate": 1.7886178861788619e-07, + "loss": -0.0703, + "num_tokens": 1389658.0, + "reward": 0.5, + "reward_std": 0.6116957664489746, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003669261932373, + "sampling/importance_sampling_ratio/min": 0.0739443451166153, + "sampling/sampling_logp_difference/max": 2.604442596435547, + "sampling/sampling_logp_difference/mean": 0.018720664083957672, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 149.28125, + "completions/mean_terminated_length": 149.28125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.13874804973602295, + "epoch": 0.056372549019607844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01525103126378843, + "kl": 0.001152846380136907, + "learning_rate": 1.8292682926829268e-07, + "loss": 0.0, + "num_tokens": 1411132.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.894408106803894, + "sampling/importance_sampling_ratio/mean": 0.9993637800216675, + "sampling/importance_sampling_ratio/min": 0.3645811975002289, + "sampling/sampling_logp_difference/max": 1.0090060234069824, + "sampling/sampling_logp_difference/mean": 0.013426810503005981, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 223.140625, + "completions/mean_terminated_length": 223.140625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.20380157232284546, + "epoch": 0.05759803921568627, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.266503743158167, + "kl": 0.0008492980268783867, + "learning_rate": 1.8699186991869917e-07, + "loss": -0.0133, + "num_tokens": 1441365.0, + "reward": 0.15625, + "reward_std": 0.519389271736145, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.781062364578247, + "sampling/importance_sampling_ratio/mean": 1.0001952648162842, + "sampling/importance_sampling_ratio/min": 0.301305890083313, + "sampling/sampling_logp_difference/max": 1.1996291875839233, + "sampling/sampling_logp_difference/mean": 0.014757132157683372, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 239.140625, + "completions/mean_terminated_length": 239.140625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.1466808319091797, + "epoch": 0.058823529411764705, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1656890291326536, + "kl": 0.000727055361494422, + "learning_rate": 1.910569105691057e-07, + "loss": 0.0332, + "num_tokens": 1471422.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997957944869995, + "sampling/importance_sampling_ratio/min": 0.2538463771343231, + "sampling/sampling_logp_difference/max": 1.3710259199142456, + "sampling/sampling_logp_difference/mean": 0.011648212559521198, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 148.03125, + "completions/mean_terminated_length": 148.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.22562922537326813, + "epoch": 0.06004901960784314, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.96917802973487, + "kl": 0.0011090695625171065, + "learning_rate": 1.951219512195122e-07, + "loss": 0.0079, + "num_tokens": 1499280.0, + "reward": 0.3125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000557899475098, + "sampling/importance_sampling_ratio/min": 0.430244505405426, + "sampling/sampling_logp_difference/max": 0.8434016704559326, + "sampling/sampling_logp_difference/mean": 0.01785694807767868, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 146.265625, + "completions/mean_terminated_length": 146.265625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.17815472185611725, + "epoch": 0.061274509803921566, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.046318973647959, + "kl": 0.0008414412150159478, + "learning_rate": 1.9918699186991868e-07, + "loss": 0.0334, + "num_tokens": 1525313.0, + "reward": 0.5, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007548332214355, + "sampling/importance_sampling_ratio/min": 0.3567883372306824, + "sampling/sampling_logp_difference/max": 1.030612587928772, + "sampling/sampling_logp_difference/mean": 0.0150204598903656, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 161.265625, + "completions/mean_terminated_length": 161.265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.21071355044841766, + "epoch": 0.0625, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.8969991490441087, + "kl": 0.0011610030196607113, + "learning_rate": 2.032520325203252e-07, + "loss": -0.0127, + "num_tokens": 1553170.0, + "reward": 0.71875, + "reward_std": 0.565913200378418, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993498921394348, + "sampling/importance_sampling_ratio/min": 0.48236799240112305, + "sampling/sampling_logp_difference/max": 0.7290480136871338, + "sampling/sampling_logp_difference/mean": 0.01719040796160698, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 235.4375, + "completions/mean_terminated_length": 235.4375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.26339125633239746, + "epoch": 0.06372549019607843, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5234770083205347, + "kl": 0.001336791436187923, + "learning_rate": 2.073170731707317e-07, + "loss": 0.0031, + "num_tokens": 1594062.0, + "reward": 0.3125, + "reward_std": 0.5847553014755249, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996271729469299, + "sampling/importance_sampling_ratio/min": 0.170707106590271, + "sampling/sampling_logp_difference/max": 1.767806053161621, + "sampling/sampling_logp_difference/mean": 0.021252326667308807, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 161.125, + "completions/mean_terminated_length": 161.125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.23058223724365234, + "epoch": 0.06495098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.943638404142557, + "kl": 0.0011498222593218088, + "learning_rate": 2.1138211382113822e-07, + "loss": 0.001, + "num_tokens": 1625990.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000584363937378, + "sampling/importance_sampling_ratio/min": 0.4440935552120209, + "sampling/sampling_logp_difference/max": 1.2157726287841797, + "sampling/sampling_logp_difference/mean": 0.017671234905719757, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 176.78125, + "completions/mean_terminated_length": 176.78125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2276969850063324, + "epoch": 0.0661764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9465295598557404, + "kl": 0.0011563922744244337, + "learning_rate": 2.154471544715447e-07, + "loss": -0.0184, + "num_tokens": 1652504.0, + "reward": -0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000014305114746, + "sampling/importance_sampling_ratio/min": 0.3149414360523224, + "sampling/sampling_logp_difference/max": 1.1553685665130615, + "sampling/sampling_logp_difference/mean": 0.016492661088705063, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 188.25, + "completions/mean_terminated_length": 188.25, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2518764138221741, + "epoch": 0.06740196078431372, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0451391458360093, + "kl": 0.0009489314979873598, + "learning_rate": 2.195121951219512e-07, + "loss": -0.0357, + "num_tokens": 1689400.0, + "reward": 0.40625, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.623826026916504, + "sampling/importance_sampling_ratio/mean": 1.0000336170196533, + "sampling/importance_sampling_ratio/min": 0.3736537992954254, + "sampling/sampling_logp_difference/max": 0.9844256639480591, + "sampling/sampling_logp_difference/mean": 0.018169749528169632, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 184.953125, + "completions/mean_terminated_length": 184.953125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.15880240499973297, + "epoch": 0.06862745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015698177885746345, + "kl": 0.0010644992580637336, + "learning_rate": 2.235772357723577e-07, + "loss": 0.0, + "num_tokens": 1720149.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995625019073486, + "sampling/importance_sampling_ratio/min": 0.29187583923339844, + "sampling/sampling_logp_difference/max": 1.23142671585083, + "sampling/sampling_logp_difference/mean": 0.014251098968088627, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 189.859375, + "completions/mean_terminated_length": 189.859375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.20032057166099548, + "epoch": 0.06985294117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9733158470984498, + "kl": 0.001350488979369402, + "learning_rate": 2.2764227642276422e-07, + "loss": -0.0297, + "num_tokens": 1749228.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.98750638961792, + "sampling/importance_sampling_ratio/mean": 1.0003879070281982, + "sampling/importance_sampling_ratio/min": 0.36696475744247437, + "sampling/sampling_logp_difference/max": 1.0024895668029785, + "sampling/sampling_logp_difference/mean": 0.01593562588095665, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 169.125, + "completions/mean_terminated_length": 169.125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.19529198110103607, + "epoch": 0.07107843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.794206828484955, + "kl": 0.0013890969567000866, + "learning_rate": 2.3170731707317074e-07, + "loss": -0.0165, + "num_tokens": 1773924.0, + "reward": -0.15625, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998757243156433, + "sampling/importance_sampling_ratio/min": 0.4793126881122589, + "sampling/sampling_logp_difference/max": 0.7354021072387695, + "sampling/sampling_logp_difference/mean": 0.015716655179858208, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 189.890625, + "completions/mean_terminated_length": 189.890625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.142284095287323, + "epoch": 0.07230392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2566331067579215, + "kl": 0.0008152315858751535, + "learning_rate": 2.3577235772357723e-07, + "loss": -0.0102, + "num_tokens": 1800413.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002665519714355, + "sampling/importance_sampling_ratio/min": 0.459773987531662, + "sampling/sampling_logp_difference/max": 1.1954364776611328, + "sampling/sampling_logp_difference/mean": 0.011489486321806908, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 183.1875, + "completions/mean_terminated_length": 183.1875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.24612563848495483, + "epoch": 0.07352941176470588, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.469674402816741, + "kl": 0.0011279808823019266, + "learning_rate": 2.3983739837398373e-07, + "loss": -0.0085, + "num_tokens": 1826601.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992374181747437, + "sampling/importance_sampling_ratio/min": 0.3642270565032959, + "sampling/sampling_logp_difference/max": 1.0099778175354004, + "sampling/sampling_logp_difference/mean": 0.01788927987217903, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 196.609375, + "completions/mean_terminated_length": 196.609375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.18494541943073273, + "epoch": 0.07475490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.342195339421481, + "kl": 0.0011641057208180428, + "learning_rate": 2.439024390243902e-07, + "loss": 0.0125, + "num_tokens": 1859008.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.923892617225647, + "sampling/importance_sampling_ratio/mean": 1.000486135482788, + "sampling/importance_sampling_ratio/min": 0.24550510942935944, + "sampling/sampling_logp_difference/max": 1.40443754196167, + "sampling/sampling_logp_difference/mean": 0.01435195654630661, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 178.765625, + "completions/mean_terminated_length": 178.765625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.17995566129684448, + "epoch": 0.07598039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.266713412472842, + "kl": 0.0013068044790998101, + "learning_rate": 2.479674796747967e-07, + "loss": 0.0098, + "num_tokens": 1888129.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002384185791016, + "sampling/importance_sampling_ratio/min": 0.336388498544693, + "sampling/sampling_logp_difference/max": 1.1315093040466309, + "sampling/sampling_logp_difference/mean": 0.016184909269213676, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 151.484375, + "completions/mean_terminated_length": 151.484375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.18587751686573029, + "epoch": 0.07720588235294118, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.4786554405848515, + "kl": 0.0009879636345431209, + "learning_rate": 2.520325203252032e-07, + "loss": -0.0322, + "num_tokens": 1912080.0, + "reward": -0.53125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": -0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.9112218618392944, + "sampling/importance_sampling_ratio/mean": 1.0004534721374512, + "sampling/importance_sampling_ratio/min": 0.23113895952701569, + "sampling/sampling_logp_difference/max": 1.4647362232208252, + "sampling/sampling_logp_difference/mean": 0.015230939723551273, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 151.78125, + "completions/mean_terminated_length": 151.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.24000594019889832, + "epoch": 0.0784313725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.96170069401628, + "kl": 0.0012934945989400148, + "learning_rate": 2.5609756097560976e-07, + "loss": 0.0851, + "num_tokens": 1942722.0, + "reward": 0.3125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.3903812766075134, + "sampling/sampling_logp_difference/max": 0.9406313896179199, + "sampling/sampling_logp_difference/mean": 0.017442770302295685, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 224.609375, + "completions/mean_terminated_length": 224.609375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.17163443565368652, + "epoch": 0.07965686274509803, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.843371466059802, + "kl": 0.0009480844018980861, + "learning_rate": 2.6016260162601625e-07, + "loss": -0.0052, + "num_tokens": 1976409.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994733333587646, + "sampling/importance_sampling_ratio/min": 0.4611442983150482, + "sampling/sampling_logp_difference/max": 0.7740442752838135, + "sampling/sampling_logp_difference/mean": 0.012315905652940273, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 210.75, + "completions/mean_terminated_length": 210.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.21215350925922394, + "epoch": 0.08088235294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.213617107549737, + "kl": 0.0016216032672673464, + "learning_rate": 2.6422764227642274e-07, + "loss": 0.0318, + "num_tokens": 2016729.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005450248718262, + "sampling/importance_sampling_ratio/min": 0.3727061450481415, + "sampling/sampling_logp_difference/max": 0.986965000629425, + "sampling/sampling_logp_difference/mean": 0.01834452524781227, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 146.921875, + "completions/mean_terminated_length": 146.921875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.18147198855876923, + "epoch": 0.0821078431372549, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.293958099941274, + "kl": 0.0013055060990154743, + "learning_rate": 2.682926829268293e-07, + "loss": 0.0432, + "num_tokens": 2042116.0, + "reward": 0.375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6976975202560425, + "sampling/importance_sampling_ratio/mean": 0.9998291730880737, + "sampling/importance_sampling_ratio/min": 0.4755859076976776, + "sampling/sampling_logp_difference/max": 0.7432076930999756, + "sampling/sampling_logp_difference/mean": 0.016954217106103897, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 198.4375, + "completions/mean_terminated_length": 198.4375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.22552774846553802, + "epoch": 0.08333333333333333, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.98609466690851, + "kl": 0.0016288069309666753, + "learning_rate": 2.7235772357723573e-07, + "loss": 0.0065, + "num_tokens": 2080208.0, + "reward": 0.6875, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.7076270580291748, + "sampling/importance_sampling_ratio/mean": 0.9996429085731506, + "sampling/importance_sampling_ratio/min": 0.23136155307292938, + "sampling/sampling_logp_difference/max": 1.4637736082077026, + "sampling/sampling_logp_difference/mean": 0.01801292598247528, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 200.890625, + "completions/mean_terminated_length": 200.890625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.23393692076206207, + "epoch": 0.08455882352941177, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.515330783548267, + "kl": 0.0011547683971002698, + "learning_rate": 2.764227642276423e-07, + "loss": 0.0033, + "num_tokens": 2109545.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998908042907715, + "sampling/importance_sampling_ratio/min": 0.4810388684272766, + "sampling/sampling_logp_difference/max": 0.8235739469528198, + "sampling/sampling_logp_difference/mean": 0.017789803445339203, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 172.078125, + "completions/mean_terminated_length": 172.078125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.25954240560531616, + "epoch": 0.0857843137254902, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7719277208100754, + "kl": 0.001475530443713069, + "learning_rate": 2.8048780487804877e-07, + "loss": -0.0355, + "num_tokens": 2143198.0, + "reward": 0.3125, + "reward_std": 0.7059217691421509, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6989006996154785, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 0.37945547699928284, + "sampling/sampling_logp_difference/max": 0.9690179824829102, + "sampling/sampling_logp_difference/mean": 0.017497912049293518, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.17055967450141907, + "epoch": 0.08700980392156862, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0908010404916784, + "kl": 0.0010060817003250122, + "learning_rate": 2.8455284552845527e-07, + "loss": -0.1028, + "num_tokens": 2168814.0, + "reward": 0.46875, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8770861625671387, + "sampling/importance_sampling_ratio/mean": 0.9999717473983765, + "sampling/importance_sampling_ratio/min": 0.5698131918907166, + "sampling/sampling_logp_difference/max": 0.6297206878662109, + "sampling/sampling_logp_difference/mean": 0.011747198179364204, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 210.21875, + "completions/mean_terminated_length": 210.21875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2961871027946472, + "epoch": 0.08823529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.827403572423837, + "kl": 0.0019968498963862658, + "learning_rate": 2.886178861788618e-07, + "loss": 0.0022, + "num_tokens": 2198684.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.807397484779358, + "sampling/importance_sampling_ratio/mean": 1.00038743019104, + "sampling/importance_sampling_ratio/min": 0.33141231536865234, + "sampling/sampling_logp_difference/max": 1.1043920516967773, + "sampling/sampling_logp_difference/mean": 0.019413597881793976, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 253.5625, + "completions/mean_terminated_length": 253.5625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2353845089673996, + "epoch": 0.08946078431372549, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1551761848321918, + "kl": 0.001148139126598835, + "learning_rate": 2.9268292682926825e-07, + "loss": 0.0357, + "num_tokens": 2233728.0, + "reward": -0.3125, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6350189447402954, + "sampling/importance_sampling_ratio/mean": 0.9997013807296753, + "sampling/importance_sampling_ratio/min": 0.42648062109947205, + "sampling/sampling_logp_difference/max": 0.8521883487701416, + "sampling/sampling_logp_difference/mean": 0.01445393543690443, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 208.71875, + "completions/mean_terminated_length": 208.71875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.17318938672542572, + "epoch": 0.09068627450980392, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7886602733080035, + "kl": 0.001269905362278223, + "learning_rate": 2.967479674796748e-07, + "loss": 0.0193, + "num_tokens": 2260334.0, + "reward": 0.15625, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.7259013652801514, + "sampling/importance_sampling_ratio/mean": 0.9998394250869751, + "sampling/importance_sampling_ratio/min": 0.39690473675727844, + "sampling/sampling_logp_difference/max": 0.9240590333938599, + "sampling/sampling_logp_difference/mean": 0.014188411645591259, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 203.5625, + "completions/mean_terminated_length": 203.5625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.20890933275222778, + "epoch": 0.09191176470588236, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.684631997120434, + "kl": 0.0016565986443310976, + "learning_rate": 3.008130081300813e-07, + "loss": 0.001, + "num_tokens": 2290786.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6600470542907715, + "sampling/importance_sampling_ratio/mean": 0.9995492696762085, + "sampling/importance_sampling_ratio/min": 0.336929053068161, + "sampling/sampling_logp_difference/max": 1.0878829956054688, + "sampling/sampling_logp_difference/mean": 0.015820588916540146, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 168.46875, + "completions/mean_terminated_length": 168.46875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.16517484188079834, + "epoch": 0.09313725490196079, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.299386141823514, + "kl": 0.001505703548900783, + "learning_rate": 3.048780487804878e-07, + "loss": 0.0102, + "num_tokens": 2315920.0, + "reward": -0.28125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821186065674, + "sampling/importance_sampling_ratio/min": 0.2679966688156128, + "sampling/sampling_logp_difference/max": 1.3167808055877686, + "sampling/sampling_logp_difference/mean": 0.013711988925933838, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 208.59375, + "completions/mean_terminated_length": 208.59375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.1949157863855362, + "epoch": 0.09436274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2605270490870344, + "kl": 0.0017528904136270285, + "learning_rate": 3.0894308943089434e-07, + "loss": -0.0173, + "num_tokens": 2346630.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999854326248169, + "sampling/importance_sampling_ratio/min": 0.29550692439079285, + "sampling/sampling_logp_difference/max": 1.2540783882141113, + "sampling/sampling_logp_difference/mean": 0.015238778665661812, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 251.453125, + "completions/mean_terminated_length": 251.453125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.20913855731487274, + "epoch": 0.09558823529411764, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0026285862178974, + "kl": 0.0016085922252386808, + "learning_rate": 3.130081300813008e-07, + "loss": -0.0103, + "num_tokens": 2377267.0, + "reward": -0.28125, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0010101795196533, + "sampling/importance_sampling_ratio/min": 0.5038118958473206, + "sampling/sampling_logp_difference/max": 0.7567176818847656, + "sampling/sampling_logp_difference/mean": 0.015605229884386063, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 197.8125, + "completions/mean_terminated_length": 197.8125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.24096760153770447, + "epoch": 0.09681372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.7601953560179315, + "kl": 0.0015045834006741643, + "learning_rate": 3.170731707317073e-07, + "loss": 0.0355, + "num_tokens": 2411895.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.47943076491355896, + "sampling/sampling_logp_difference/max": 0.8563418388366699, + "sampling/sampling_logp_difference/mean": 0.016151513904333115, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 186.90625, + "completions/mean_terminated_length": 186.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.22716647386550903, + "epoch": 0.09803921568627451, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.111168991316798, + "kl": 0.002549254335463047, + "learning_rate": 3.211382113821138e-07, + "loss": 0.0048, + "num_tokens": 2443393.0, + "reward": 0.5, + "reward_std": 0.843070387840271, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998606443405151, + "sampling/importance_sampling_ratio/min": 0.24906150996685028, + "sampling/sampling_logp_difference/max": 1.3900554180145264, + "sampling/sampling_logp_difference/mean": 0.018136531114578247, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 135.546875, + "completions/mean_terminated_length": 135.546875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.17298495769500732, + "epoch": 0.09926470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.258324059124309, + "kl": 0.002044468652456999, + "learning_rate": 3.252032520325203e-07, + "loss": 0.0254, + "num_tokens": 2470196.0, + "reward": 0.75, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005346536636353, + "sampling/importance_sampling_ratio/min": 0.458995521068573, + "sampling/sampling_logp_difference/max": 0.9160494804382324, + "sampling/sampling_logp_difference/mean": 0.01386941596865654, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 184.25, + "completions/mean_terminated_length": 184.25, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.18176504969596863, + "epoch": 0.10049019607843138, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0259646735635353, + "kl": 0.0018528061918914318, + "learning_rate": 3.292682926829268e-07, + "loss": 0.0875, + "num_tokens": 2496772.0, + "reward": -0.0625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.766034722328186, + "sampling/importance_sampling_ratio/mean": 0.9997857213020325, + "sampling/importance_sampling_ratio/min": 0.5311629772186279, + "sampling/sampling_logp_difference/max": 0.6326863765716553, + "sampling/sampling_logp_difference/mean": 0.01508853118866682, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 192.34375, + "completions/mean_terminated_length": 192.34375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.18329501152038574, + "epoch": 0.1017156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.778822142019011, + "kl": 0.0023378990590572357, + "learning_rate": 3.333333333333333e-07, + "loss": 0.001, + "num_tokens": 2523226.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995636343955994, + "sampling/importance_sampling_ratio/min": 0.4812057614326477, + "sampling/sampling_logp_difference/max": 0.7314603328704834, + "sampling/sampling_logp_difference/mean": 0.015016846358776093, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 239.4375, + "completions/mean_terminated_length": 239.4375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.21985948085784912, + "epoch": 0.10294117647058823, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8214459082790309, + "kl": 0.0020694350823760033, + "learning_rate": 3.3739837398373985e-07, + "loss": -0.0622, + "num_tokens": 2560038.0, + "reward": -0.125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997300505638123, + "sampling/importance_sampling_ratio/min": 0.3163585066795349, + "sampling/sampling_logp_difference/max": 1.1508792638778687, + "sampling/sampling_logp_difference/mean": 0.014631749130785465, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 241.359375, + "completions/mean_terminated_length": 241.359375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2028217315673828, + "epoch": 0.10416666666666667, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2121606808342307, + "kl": 0.002727421699091792, + "learning_rate": 3.4146341463414634e-07, + "loss": 0.0238, + "num_tokens": 2597037.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.825548529624939, + "sampling/importance_sampling_ratio/mean": 0.9998873472213745, + "sampling/importance_sampling_ratio/min": 0.21376019716262817, + "sampling/sampling_logp_difference/max": 1.5429004430770874, + "sampling/sampling_logp_difference/mean": 0.015795081853866577, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 155.171875, + "completions/mean_terminated_length": 155.171875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.16458287835121155, + "epoch": 0.1053921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026919855382934496, + "kl": 0.0018414882943034172, + "learning_rate": 3.4552845528455284e-07, + "loss": 0.0, + "num_tokens": 2622520.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9431533813476562, + "sampling/importance_sampling_ratio/mean": 1.0006672143936157, + "sampling/importance_sampling_ratio/min": 0.42132270336151123, + "sampling/sampling_logp_difference/max": 0.8643562197685242, + "sampling/sampling_logp_difference/mean": 0.01315943244844675, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 273.34375, + "completions/mean_terminated_length": 273.34375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.23761431872844696, + "epoch": 0.10661764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.419811549150165, + "kl": 0.0020909798331558704, + "learning_rate": 3.4959349593495933e-07, + "loss": 0.021, + "num_tokens": 2657998.0, + "reward": 0.15625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002392530441284, + "sampling/importance_sampling_ratio/min": 0.4424218535423279, + "sampling/sampling_logp_difference/max": 0.8322451114654541, + "sampling/sampling_logp_difference/mean": 0.01570003293454647, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 233.125, + "completions/mean_terminated_length": 233.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.14569368958473206, + "epoch": 0.10784313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9791468306525047, + "kl": 0.001645655371248722, + "learning_rate": 3.536585365853658e-07, + "loss": -0.0023, + "num_tokens": 2687702.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.9099587202072144, + "sampling/importance_sampling_ratio/mean": 1.00053071975708, + "sampling/importance_sampling_ratio/min": 0.29515644907951355, + "sampling/sampling_logp_difference/max": 1.2202496528625488, + "sampling/sampling_logp_difference/mean": 0.012327494099736214, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 206.796875, + "completions/mean_terminated_length": 206.796875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.13218270242214203, + "epoch": 0.1090686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.066458335616022, + "kl": 0.0034873075783252716, + "learning_rate": 3.5772357723577237e-07, + "loss": -0.0872, + "num_tokens": 2719641.0, + "reward": 0.125, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.9223304986953735, + "sampling/importance_sampling_ratio/mean": 0.998964786529541, + "sampling/importance_sampling_ratio/min": 0.24123281240463257, + "sampling/sampling_logp_difference/max": 1.4219927787780762, + "sampling/sampling_logp_difference/mean": 0.01427120715379715, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 236.03125, + "completions/mean_terminated_length": 236.03125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2090870440006256, + "epoch": 0.11029411764705882, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.4617604302787153, + "kl": 0.0022184555418789387, + "learning_rate": 3.6178861788617886e-07, + "loss": 0.0033, + "num_tokens": 2753547.0, + "reward": 0.40625, + "reward_std": 0.6046693325042725, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993847608566284, + "sampling/importance_sampling_ratio/min": 0.07257959991693497, + "sampling/sampling_logp_difference/max": 2.6230714321136475, + "sampling/sampling_logp_difference/mean": 0.015504513867199421, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 203.328125, + "completions/mean_terminated_length": 203.328125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.21814948320388794, + "epoch": 0.11151960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3855837266028832, + "kl": 0.003080436959862709, + "learning_rate": 3.6585365853658536e-07, + "loss": 0.036, + "num_tokens": 2785984.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.697697401046753, + "sampling/importance_sampling_ratio/mean": 1.000431776046753, + "sampling/importance_sampling_ratio/min": 0.5824047923088074, + "sampling/sampling_logp_difference/max": 0.5405895709991455, + "sampling/sampling_logp_difference/mean": 0.01569124311208725, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 157.9375, + "completions/mean_terminated_length": 157.9375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.15251809358596802, + "epoch": 0.11274509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3646620053083798, + "kl": 0.0027233543805778027, + "learning_rate": 3.6991869918699185e-07, + "loss": -0.0658, + "num_tokens": 2809436.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.9242669343948364, + "sampling/importance_sampling_ratio/mean": 1.0009275674819946, + "sampling/importance_sampling_ratio/min": 0.3941422700881958, + "sampling/sampling_logp_difference/max": 0.9310433864593506, + "sampling/sampling_logp_difference/mean": 0.011663028970360756, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 248.28125, + "completions/mean_terminated_length": 248.28125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.20643439888954163, + "epoch": 0.11397058823529412, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.231436724426304, + "kl": 0.003451679367572069, + "learning_rate": 3.7398373983739835e-07, + "loss": -0.046, + "num_tokens": 2844286.0, + "reward": 0.46875, + "reward_std": 0.8837460875511169, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000553131103516, + "sampling/importance_sampling_ratio/min": 0.29556503891944885, + "sampling/sampling_logp_difference/max": 1.2188663482666016, + "sampling/sampling_logp_difference/mean": 0.015196739695966244, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 152.84375, + "completions/mean_terminated_length": 152.84375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.14843550324440002, + "epoch": 0.11519607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.5282982854898655, + "kl": 0.002533209975808859, + "learning_rate": 3.7804878048780484e-07, + "loss": 0.0155, + "num_tokens": 2867652.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.7544697523117065, + "sampling/importance_sampling_ratio/mean": 1.0003670454025269, + "sampling/importance_sampling_ratio/min": 0.5090111494064331, + "sampling/sampling_logp_difference/max": 0.6752853393554688, + "sampling/sampling_logp_difference/mean": 0.012290108948946, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 252.328125, + "completions/mean_terminated_length": 252.328125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.21648448705673218, + "epoch": 0.11642156862745098, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1210581745746024, + "kl": 0.00387467909604311, + "learning_rate": 3.821138211382114e-07, + "loss": -0.0213, + "num_tokens": 2902793.0, + "reward": 0.34375, + "reward_std": 0.747555673122406, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996301531791687, + "sampling/importance_sampling_ratio/min": 0.4053426682949066, + "sampling/sampling_logp_difference/max": 1.3121697902679443, + "sampling/sampling_logp_difference/mean": 0.015475263819098473, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 142.421875, + "completions/mean_terminated_length": 142.421875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.2054484337568283, + "epoch": 0.11764705882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.2365132525166236, + "kl": 0.00443034153431654, + "learning_rate": 3.861788617886179e-07, + "loss": -0.0237, + "num_tokens": 2932836.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.873075246810913, + "sampling/importance_sampling_ratio/mean": 0.9996028542518616, + "sampling/importance_sampling_ratio/min": 0.11423374712467194, + "sampling/sampling_logp_difference/max": 2.169508457183838, + "sampling/sampling_logp_difference/mean": 0.017275255173444748, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 185.5625, + "completions/mean_terminated_length": 185.5625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.1749388575553894, + "epoch": 0.11887254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7714766396209334, + "kl": 0.002680524019524455, + "learning_rate": 3.902439024390244e-07, + "loss": -0.0616, + "num_tokens": 2961608.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999269425868988, + "sampling/importance_sampling_ratio/min": 0.495414674282074, + "sampling/sampling_logp_difference/max": 0.7522246837615967, + "sampling/sampling_logp_difference/mean": 0.013655086979269981, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 152.25, + "completions/mean_terminated_length": 152.25, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.22369414567947388, + "epoch": 0.12009803921568628, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3642297674506656, + "kl": 0.004713735543191433, + "learning_rate": 3.9430894308943087e-07, + "loss": -0.039, + "num_tokens": 2989848.0, + "reward": 0.4375, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001537799835205, + "sampling/importance_sampling_ratio/min": 0.06123197078704834, + "sampling/sampling_logp_difference/max": 2.793085813522339, + "sampling/sampling_logp_difference/mean": 0.016208482906222343, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 190.171875, + "completions/mean_terminated_length": 190.171875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.184893399477005, + "epoch": 0.1213235294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2527448488608326, + "kl": 0.002585100941359997, + "learning_rate": 3.9837398373983736e-07, + "loss": 0.0081, + "num_tokens": 3018771.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7099417448043823, + "sampling/importance_sampling_ratio/mean": 0.999454140663147, + "sampling/importance_sampling_ratio/min": 0.38532912731170654, + "sampling/sampling_logp_difference/max": 0.9536573886871338, + "sampling/sampling_logp_difference/mean": 0.016417233273386955, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 237.203125, + "completions/mean_terminated_length": 237.203125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.22887080907821655, + "epoch": 0.12254901960784313, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7804555425711541, + "kl": 0.0029146044980734587, + "learning_rate": 4.024390243902439e-07, + "loss": 0.0017, + "num_tokens": 3053056.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001249313354492, + "sampling/importance_sampling_ratio/min": 0.49485504627227783, + "sampling/sampling_logp_difference/max": 0.8862013816833496, + "sampling/sampling_logp_difference/mean": 0.01566164568066597, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 204.46875, + "completions/mean_terminated_length": 204.46875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2525796890258789, + "epoch": 0.12377450980392157, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.717185626522858, + "kl": 0.003638624679297209, + "learning_rate": 4.065040650406504e-07, + "loss": 0.0101, + "num_tokens": 3085022.0, + "reward": 0.5625, + "reward_std": 0.5765564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995367527008057, + "sampling/importance_sampling_ratio/min": 0.5260770916938782, + "sampling/sampling_logp_difference/max": 0.8979051113128662, + "sampling/sampling_logp_difference/mean": 0.018047038465738297, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 5000.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 270.453125, + "completions/mean_terminated_length": 195.38096618652344, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.1673162579536438, + "epoch": 0.125, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6617256672581135, + "kl": 0.003493118565529585, + "learning_rate": 4.105691056910569e-07, + "loss": 0.5592, + "num_tokens": 3121419.0, + "reward": -0.125, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996997117996216, + "sampling/importance_sampling_ratio/min": 0.1413872390985489, + "sampling/sampling_logp_difference/max": 1.9562528133392334, + "sampling/sampling_logp_difference/mean": 0.01538526639342308, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 135.90625, + "completions/mean_terminated_length": 135.90625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.19178907573223114, + "epoch": 0.12622549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.9304998229222012, + "kl": 0.004775335546582937, + "learning_rate": 4.146341463414634e-07, + "loss": 0.0087, + "num_tokens": 3150517.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.8355188369750977, + "sampling/importance_sampling_ratio/mean": 1.0008076429367065, + "sampling/importance_sampling_ratio/min": 0.40564560890197754, + "sampling/sampling_logp_difference/max": 0.9022754430770874, + "sampling/sampling_logp_difference/mean": 0.0178519319742918, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 196.859375, + "completions/mean_terminated_length": 196.859375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.21618320047855377, + "epoch": 0.12745098039215685, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.7597974618964067, + "kl": 0.002705794293433428, + "learning_rate": 4.186991869918699e-07, + "loss": 0.1055, + "num_tokens": 3181788.0, + "reward": 0.375, + "reward_std": 0.5651718378067017, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001839399337769, + "sampling/importance_sampling_ratio/min": 0.23663082718849182, + "sampling/sampling_logp_difference/max": 1.4412540197372437, + "sampling/sampling_logp_difference/mean": 0.016408627852797508, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 259.0625, + "completions/mean_terminated_length": 259.0625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.19014139473438263, + "epoch": 0.12867647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5398076256891096, + "kl": 0.0030213571153581142, + "learning_rate": 4.2276422764227643e-07, + "loss": 0.0671, + "num_tokens": 3217744.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.9693632125854492, + "sampling/importance_sampling_ratio/mean": 1.0007412433624268, + "sampling/importance_sampling_ratio/min": 0.3723587989807129, + "sampling/sampling_logp_difference/max": 0.9878973960876465, + "sampling/sampling_logp_difference/mean": 0.014151263982057571, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 151.921875, + "completions/mean_terminated_length": 151.921875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.17414289712905884, + "epoch": 0.12990196078431374, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9964054092101362, + "kl": 0.0042853644117712975, + "learning_rate": 4.268292682926829e-07, + "loss": 0.0603, + "num_tokens": 3242923.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5903403759002686, + "sampling/importance_sampling_ratio/mean": 1.0001468658447266, + "sampling/importance_sampling_ratio/min": 0.20117127895355225, + "sampling/sampling_logp_difference/max": 1.6035985946655273, + "sampling/sampling_logp_difference/mean": 0.016937680542469025, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 243.296875, + "completions/mean_terminated_length": 243.296875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.14538052678108215, + "epoch": 0.13112745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.776854554423682, + "kl": 0.0027929027564823627, + "learning_rate": 4.308943089430894e-07, + "loss": 0.0225, + "num_tokens": 3283790.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.43359288573265076, + "sampling/sampling_logp_difference/max": 1.1120898723602295, + "sampling/sampling_logp_difference/mean": 0.012797070667147636, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 181.75, + "completions/mean_terminated_length": 181.75, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.1644519865512848, + "epoch": 0.1323529411764706, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.0964938129632356, + "kl": 0.003369982587173581, + "learning_rate": 4.349593495934959e-07, + "loss": 0.0225, + "num_tokens": 3321534.0, + "reward": 0.40625, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7989542484283447, + "sampling/importance_sampling_ratio/mean": 0.9994832277297974, + "sampling/importance_sampling_ratio/min": 0.027374885976314545, + "sampling/sampling_logp_difference/max": 3.5981292724609375, + "sampling/sampling_logp_difference/mean": 0.013361322693526745, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 256.609375, + "completions/mean_terminated_length": 256.609375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.20587630569934845, + "epoch": 0.13357843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5257994074267291, + "kl": 0.0031718644313514233, + "learning_rate": 4.390243902439024e-07, + "loss": 0.0223, + "num_tokens": 3362229.0, + "reward": 0.59375, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002754926681519, + "sampling/importance_sampling_ratio/min": 0.011834507808089256, + "sampling/sampling_logp_difference/max": 4.4367356300354, + "sampling/sampling_logp_difference/mean": 0.01595362275838852, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 229.296875, + "completions/mean_terminated_length": 229.296875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.15049073100090027, + "epoch": 0.13480392156862744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024043545684525217, + "kl": 0.0024401461705565453, + "learning_rate": 4.4308943089430896e-07, + "loss": 0.0, + "num_tokens": 3404056.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8615195751190186, + "sampling/importance_sampling_ratio/mean": 0.9994591474533081, + "sampling/importance_sampling_ratio/min": 0.1321420669555664, + "sampling/sampling_logp_difference/max": 2.0238776206970215, + "sampling/sampling_logp_difference/mean": 0.011990568600594997, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 140.734375, + "completions/mean_terminated_length": 140.734375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.15258745849132538, + "epoch": 0.13602941176470587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036371133195659794, + "kl": 0.0029322488699108362, + "learning_rate": 4.471544715447154e-07, + "loss": 0.0, + "num_tokens": 3427655.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6232942342758179, + "sampling/importance_sampling_ratio/mean": 0.9999867081642151, + "sampling/importance_sampling_ratio/min": 0.5510865449905396, + "sampling/sampling_logp_difference/max": 0.5958633422851562, + "sampling/sampling_logp_difference/mean": 0.012529904022812843, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 268.984375, + "completions/mean_terminated_length": 268.984375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.24910208582878113, + "epoch": 0.13725490196078433, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9207390396724677, + "kl": 0.0038540945388376713, + "learning_rate": 4.5121951219512194e-07, + "loss": -0.1523, + "num_tokens": 3460934.0, + "reward": 0.25, + "reward_std": 0.6813369989395142, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997692108154297, + "sampling/importance_sampling_ratio/min": 0.22504088282585144, + "sampling/sampling_logp_difference/max": 1.4914731979370117, + "sampling/sampling_logp_difference/mean": 0.015616269782185555, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 185.921875, + "completions/mean_terminated_length": 185.921875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.1516919881105423, + "epoch": 0.13848039215686275, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5625968120291285, + "kl": 0.0030631846748292446, + "learning_rate": 4.5528455284552844e-07, + "loss": 0.0062, + "num_tokens": 3494769.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002806186676025, + "sampling/importance_sampling_ratio/min": 0.509614884853363, + "sampling/sampling_logp_difference/max": 0.6986632347106934, + "sampling/sampling_logp_difference/mean": 0.012877561151981354, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 175.953125, + "completions/mean_terminated_length": 175.953125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.20463861525058746, + "epoch": 0.13970588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6959729876683496, + "kl": 0.0027167543303221464, + "learning_rate": 4.5934959349593493e-07, + "loss": 0.0068, + "num_tokens": 3523326.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.8048852682113647, + "sampling/importance_sampling_ratio/mean": 0.9999552369117737, + "sampling/importance_sampling_ratio/min": 0.36874687671661377, + "sampling/sampling_logp_difference/max": 0.9976449012756348, + "sampling/sampling_logp_difference/mean": 0.015765059739351273, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 176.15625, + "completions/mean_terminated_length": 176.15625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.11405276507139206, + "epoch": 0.1409313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028435623016476855, + "kl": 0.002391491550952196, + "learning_rate": 4.634146341463415e-07, + "loss": 0.0, + "num_tokens": 3550072.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.989517092704773, + "sampling/importance_sampling_ratio/mean": 0.9999762177467346, + "sampling/importance_sampling_ratio/min": 0.4801751971244812, + "sampling/sampling_logp_difference/max": 0.7336042523384094, + "sampling/sampling_logp_difference/mean": 0.0115530239418149, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 175.21875, + "completions/mean_terminated_length": 175.21875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.18383879959583282, + "epoch": 0.14215686274509803, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.926756574189135, + "kl": 0.004470705054700375, + "learning_rate": 4.674796747967479e-07, + "loss": -0.0008, + "num_tokens": 3584502.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006392002105713, + "sampling/importance_sampling_ratio/min": 0.3582732081413269, + "sampling/sampling_logp_difference/max": 1.1886632442474365, + "sampling/sampling_logp_difference/mean": 0.01593351922929287, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 333.65625, + "completions/mean_terminated_length": 333.65625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.13514362275600433, + "epoch": 0.14338235294117646, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4031890143594556, + "kl": 0.0020420460496097803, + "learning_rate": 4.7154471544715447e-07, + "loss": -0.0866, + "num_tokens": 3624608.0, + "reward": 0.0625, + "reward_std": 0.617996096611023, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.9954233169555664, + "sampling/importance_sampling_ratio/mean": 1.0003677606582642, + "sampling/importance_sampling_ratio/min": 0.4747653305530548, + "sampling/sampling_logp_difference/max": 0.7449345588684082, + "sampling/sampling_logp_difference/mean": 0.00980311818420887, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 246.296875, + "completions/mean_terminated_length": 246.296875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.1611090898513794, + "epoch": 0.14460784313725492, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4060303837792323, + "kl": 0.0024019847624003887, + "learning_rate": 4.756097560975609e-07, + "loss": -0.0066, + "num_tokens": 3659251.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008327960968018, + "sampling/importance_sampling_ratio/min": 0.2378438413143158, + "sampling/sampling_logp_difference/max": 1.436141014099121, + "sampling/sampling_logp_difference/mean": 0.012749040499329567, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 209.328125, + "completions/mean_terminated_length": 209.328125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2315022349357605, + "epoch": 0.14583333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9311765050475516, + "kl": 0.004573897924274206, + "learning_rate": 4.796747967479675e-07, + "loss": -0.0414, + "num_tokens": 3686840.0, + "reward": -0.65625, + "reward_std": 0.7366957664489746, + "rewards/decision_reward_func/mean": -0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003242492675781, + "sampling/importance_sampling_ratio/min": 0.4618137776851654, + "sampling/sampling_logp_difference/max": 1.0040912628173828, + "sampling/sampling_logp_difference/mean": 0.01711522415280342, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 193.453125, + "completions/mean_terminated_length": 193.453125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.17183716595172882, + "epoch": 0.14705882352941177, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.128245508696429, + "kl": 0.0038222374860197306, + "learning_rate": 4.83739837398374e-07, + "loss": 0.0195, + "num_tokens": 3714709.0, + "reward": 0.53125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004196166992188, + "sampling/importance_sampling_ratio/min": 0.11990854144096375, + "sampling/sampling_logp_difference/max": 2.121026039123535, + "sampling/sampling_logp_difference/mean": 0.013609963469207287, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 242.46875, + "completions/mean_terminated_length": 242.46875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2437196969985962, + "epoch": 0.1482843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4598078819366105, + "kl": 0.002234832150861621, + "learning_rate": 4.878048780487804e-07, + "loss": -0.0034, + "num_tokens": 3753411.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.7094210386276245, + "sampling/importance_sampling_ratio/mean": 0.9994633197784424, + "sampling/importance_sampling_ratio/min": 0.22669436037540436, + "sampling/sampling_logp_difference/max": 1.4841525554656982, + "sampling/sampling_logp_difference/mean": 0.017895739525556564, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 207.90625, + "completions/mean_terminated_length": 207.90625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.20850908756256104, + "epoch": 0.14950980392156862, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5981813416819735, + "kl": 0.0032731422688812017, + "learning_rate": 4.91869918699187e-07, + "loss": -0.0103, + "num_tokens": 3786877.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5795906782150269, + "sampling/importance_sampling_ratio/mean": 0.999427080154419, + "sampling/importance_sampling_ratio/min": 0.3412606716156006, + "sampling/sampling_logp_difference/max": 1.0751086473464966, + "sampling/sampling_logp_difference/mean": 0.014012198895215988, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 158.421875, + "completions/mean_terminated_length": 158.421875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.1915990114212036, + "epoch": 0.15073529411764705, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.2374537936724637, + "kl": 0.003779832972213626, + "learning_rate": 4.959349593495934e-07, + "loss": 0.0142, + "num_tokens": 3810856.0, + "reward": 0.5, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.86378812789917, + "sampling/importance_sampling_ratio/mean": 0.9994109272956848, + "sampling/importance_sampling_ratio/min": 0.459695428609848, + "sampling/sampling_logp_difference/max": 0.777191162109375, + "sampling/sampling_logp_difference/mean": 0.015889719128608704, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 324.296875, + "completions/mean_terminated_length": 324.296875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.19484072923660278, + "epoch": 0.15196078431372548, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.837653411054766, + "kl": 0.0026376841124147177, + "learning_rate": 5e-07, + "loss": -0.0623, + "num_tokens": 3859771.0, + "reward": 0.15625, + "reward_std": 0.519389271736145, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995371699333191, + "sampling/importance_sampling_ratio/min": 0.3069584369659424, + "sampling/sampling_logp_difference/max": 1.1810429096221924, + "sampling/sampling_logp_difference/mean": 0.013854834251105785, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 165.09375, + "completions/mean_terminated_length": 165.09375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.17323510348796844, + "epoch": 0.15318627450980393, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5482149381120744, + "kl": 0.0043769716285169125, + "learning_rate": 5.040650406504064e-07, + "loss": 0.0973, + "num_tokens": 3889521.0, + "reward": 0.46875, + "reward_std": 0.46656501293182373, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.612404704093933, + "sampling/importance_sampling_ratio/mean": 0.9999923706054688, + "sampling/importance_sampling_ratio/min": 0.4086287319660187, + "sampling/sampling_logp_difference/max": 0.8949482440948486, + "sampling/sampling_logp_difference/mean": 0.013625801540911198, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 217.21875, + "completions/mean_terminated_length": 217.21875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.24082908034324646, + "epoch": 0.15441176470588236, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0962555888788272, + "kl": 0.006179064512252808, + "learning_rate": 5.081300813008131e-07, + "loss": 0.0102, + "num_tokens": 3920687.0, + "reward": 0.53125, + "reward_std": 0.6970869898796082, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999029040336609, + "sampling/importance_sampling_ratio/min": 0.4000912308692932, + "sampling/sampling_logp_difference/max": 0.9160627126693726, + "sampling/sampling_logp_difference/mean": 0.01595935970544815, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 148.53125, + "completions/mean_terminated_length": 148.53125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.16705353558063507, + "epoch": 0.1556372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.3979375953032402, + "kl": 0.004339561332017183, + "learning_rate": 5.121951219512195e-07, + "loss": -0.0035, + "num_tokens": 3945777.0, + "reward": -0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6513288021087646, + "sampling/importance_sampling_ratio/mean": 0.999443769454956, + "sampling/importance_sampling_ratio/min": 0.4581097364425659, + "sampling/sampling_logp_difference/max": 0.780646562576294, + "sampling/sampling_logp_difference/mean": 0.014140298590064049, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 148.28125, + "completions/mean_terminated_length": 148.28125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.20434808731079102, + "epoch": 0.1568627450980392, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.053304404963926, + "kl": 0.006051310338079929, + "learning_rate": 5.16260162601626e-07, + "loss": 0.0431, + "num_tokens": 3971763.0, + "reward": 0.5, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995803833007812, + "sampling/importance_sampling_ratio/min": 0.2824687957763672, + "sampling/sampling_logp_difference/max": 1.2641870975494385, + "sampling/sampling_logp_difference/mean": 0.015509507618844509, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 259.71875, + "completions/mean_terminated_length": 259.71875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.17522120475769043, + "epoch": 0.15808823529411764, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1036992710076245, + "kl": 0.0062559181824326515, + "learning_rate": 5.203252032520325e-07, + "loss": 0.0781, + "num_tokens": 4006929.0, + "reward": 0.5625, + "reward_std": 0.6663130521774292, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00052809715271, + "sampling/importance_sampling_ratio/min": 0.20834197103977203, + "sampling/sampling_logp_difference/max": 1.5685744285583496, + "sampling/sampling_logp_difference/mean": 0.013994507491588593, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 207.625, + "completions/mean_terminated_length": 207.625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.22118355333805084, + "epoch": 0.15931372549019607, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5144738581732677, + "kl": 0.006246750243008137, + "learning_rate": 5.24390243902439e-07, + "loss": -0.0558, + "num_tokens": 4036873.0, + "reward": 0.375, + "reward_std": 0.6267197132110596, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.3516245484352112, + "sampling/sampling_logp_difference/max": 1.0451912879943848, + "sampling/sampling_logp_difference/mean": 0.014911655336618423, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 153.0, + "completions/mean_terminated_length": 153.0, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.13152629137039185, + "epoch": 0.16053921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03046928461366396, + "kl": 0.0037722119595855474, + "learning_rate": 5.284552845528455e-07, + "loss": 0.0, + "num_tokens": 4065097.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000600814819336, + "sampling/importance_sampling_ratio/min": 0.11419089883565903, + "sampling/sampling_logp_difference/max": 2.1698837280273438, + "sampling/sampling_logp_difference/mean": 0.011482913978397846, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 192.8125, + "completions/mean_terminated_length": 192.8125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.14755786955356598, + "epoch": 0.16176470588235295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03286335625323049, + "kl": 0.004532915540039539, + "learning_rate": 5.325203252032519e-07, + "loss": 0.0, + "num_tokens": 4094413.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997543096542358, + "sampling/importance_sampling_ratio/min": 0.09504194557666779, + "sampling/sampling_logp_difference/max": 2.3534369468688965, + "sampling/sampling_logp_difference/mean": 0.01464562863111496, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 164.078125, + "completions/mean_terminated_length": 164.078125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.1908288598060608, + "epoch": 0.16299019607843138, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0801893591822433, + "kl": 0.005190457217395306, + "learning_rate": 5.365853658536586e-07, + "loss": 0.0327, + "num_tokens": 4124530.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999281167984009, + "sampling/importance_sampling_ratio/min": 0.30315646529197693, + "sampling/sampling_logp_difference/max": 1.3454761505126953, + "sampling/sampling_logp_difference/mean": 0.018591245636343956, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 200.234375, + "completions/mean_terminated_length": 200.234375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.20808929204940796, + "epoch": 0.1642156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2169722191860628, + "kl": 0.009982358664274216, + "learning_rate": 5.40650406504065e-07, + "loss": -0.0116, + "num_tokens": 4153425.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002822875976562, + "sampling/importance_sampling_ratio/min": 0.5256777405738831, + "sampling/sampling_logp_difference/max": 0.8757648468017578, + "sampling/sampling_logp_difference/mean": 0.014309515245258808, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 214.40625, + "completions/mean_terminated_length": 214.40625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2324899584054947, + "epoch": 0.16544117647058823, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.679416954022788, + "kl": 0.012347174808382988, + "learning_rate": 5.447154471544715e-07, + "loss": -0.0228, + "num_tokens": 4189371.0, + "reward": 0.75, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000662803649902, + "sampling/importance_sampling_ratio/min": 0.41598430275917053, + "sampling/sampling_logp_difference/max": 0.8771077394485474, + "sampling/sampling_logp_difference/mean": 0.01692269928753376, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 215.828125, + "completions/mean_terminated_length": 215.828125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.2709134817123413, + "epoch": 0.16666666666666666, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2187073874644576, + "kl": 0.007469979114830494, + "learning_rate": 5.487804878048781e-07, + "loss": -0.0014, + "num_tokens": 4225440.0, + "reward": 0.3125, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.7517609596252441, + "sampling/importance_sampling_ratio/mean": 1.0001893043518066, + "sampling/importance_sampling_ratio/min": 0.4335139989852905, + "sampling/sampling_logp_difference/max": 0.8358311653137207, + "sampling/sampling_logp_difference/mean": 0.018575577065348625, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 219.53125, + "completions/mean_terminated_length": 219.53125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.18905353546142578, + "epoch": 0.16789215686274508, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5848390274616682, + "kl": 0.0062895650044083595, + "learning_rate": 5.528455284552846e-07, + "loss": 0.0301, + "num_tokens": 4256034.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.7029181718826294, + "sampling/importance_sampling_ratio/mean": 1.0001823902130127, + "sampling/importance_sampling_ratio/min": 0.5002944469451904, + "sampling/sampling_logp_difference/max": 0.6925585269927979, + "sampling/sampling_logp_difference/mean": 0.012864059768617153, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.21458560228347778, + "epoch": 0.16911764705882354, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8899681943241897, + "kl": 0.007482388522475958, + "learning_rate": 5.56910569105691e-07, + "loss": 0.0397, + "num_tokens": 4285738.0, + "reward": 0.125, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.9538545608520508, + "sampling/importance_sampling_ratio/mean": 1.0000500679016113, + "sampling/importance_sampling_ratio/min": 0.2822778820991516, + "sampling/sampling_logp_difference/max": 1.2648632526397705, + "sampling/sampling_logp_difference/mean": 0.01581709086894989, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 211.796875, + "completions/mean_terminated_length": 211.796875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2663291096687317, + "epoch": 0.17034313725490197, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.811200263563688, + "kl": 0.006997620686888695, + "learning_rate": 5.609756097560975e-07, + "loss": 0.0245, + "num_tokens": 4316541.0, + "reward": 0.59375, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.7874058485031128, + "sampling/importance_sampling_ratio/mean": 1.0008964538574219, + "sampling/importance_sampling_ratio/min": 0.49483078718185425, + "sampling/sampling_logp_difference/max": 0.7035393714904785, + "sampling/sampling_logp_difference/mean": 0.017890911549329758, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 205.03125, + "completions/mean_terminated_length": 205.03125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.20182162523269653, + "epoch": 0.1715686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034780777827098576, + "kl": 0.004200125113129616, + "learning_rate": 5.650406504065041e-07, + "loss": 0.0, + "num_tokens": 4349471.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000801682472229, + "sampling/importance_sampling_ratio/min": 0.46579307317733765, + "sampling/sampling_logp_difference/max": 0.8023383617401123, + "sampling/sampling_logp_difference/mean": 0.015425757504999638, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 155.640625, + "completions/mean_terminated_length": 155.640625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.15710008144378662, + "epoch": 0.17279411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2324288770488674, + "kl": 0.009632674977183342, + "learning_rate": 5.691056910569105e-07, + "loss": 0.0051, + "num_tokens": 4374408.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004931688308716, + "sampling/importance_sampling_ratio/min": 0.37433290481567383, + "sampling/sampling_logp_difference/max": 0.982609748840332, + "sampling/sampling_logp_difference/mean": 0.012852296233177185, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 168.21875, + "completions/mean_terminated_length": 168.21875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.18867620825767517, + "epoch": 0.17401960784313725, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3049256000371254, + "kl": 0.006957865320146084, + "learning_rate": 5.73170731707317e-07, + "loss": 0.0155, + "num_tokens": 4402886.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9916741847991943, + "sampling/importance_sampling_ratio/mean": 1.0004972219467163, + "sampling/importance_sampling_ratio/min": 0.36625033617019653, + "sampling/sampling_logp_difference/max": 1.0044382810592651, + "sampling/sampling_logp_difference/mean": 0.01673087105154991, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 158.5625, + "completions/mean_terminated_length": 158.5625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.19338209927082062, + "epoch": 0.17524509803921567, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.986566813860805, + "kl": 0.006531933322548866, + "learning_rate": 5.772357723577236e-07, + "loss": 0.0225, + "num_tokens": 4427706.0, + "reward": 0.34375, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997323751449585, + "sampling/importance_sampling_ratio/min": 0.38433632254600525, + "sampling/sampling_logp_difference/max": 0.9562373161315918, + "sampling/sampling_logp_difference/mean": 0.01588144153356552, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 161.203125, + "completions/mean_terminated_length": 161.203125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.27450457215309143, + "epoch": 0.17647058823529413, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.059225107853198, + "kl": 0.014421003870666027, + "learning_rate": 5.813008130081301e-07, + "loss": 0.0124, + "num_tokens": 4465031.0, + "reward": 0.21875, + "reward_std": 0.747555673122406, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994995594024658, + "sampling/importance_sampling_ratio/min": 0.27133044600486755, + "sampling/sampling_logp_difference/max": 1.3044178485870361, + "sampling/sampling_logp_difference/mean": 0.02083595097064972, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 144.671875, + "completions/mean_terminated_length": 144.671875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.18501253426074982, + "epoch": 0.17769607843137256, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8337965737119062, + "kl": 0.008115117438137531, + "learning_rate": 5.853658536585365e-07, + "loss": -0.0146, + "num_tokens": 4491730.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.7520533800125122, + "sampling/importance_sampling_ratio/mean": 1.0005688667297363, + "sampling/importance_sampling_ratio/min": 0.4150790870189667, + "sampling/sampling_logp_difference/max": 0.8792862892150879, + "sampling/sampling_logp_difference/mean": 0.015096020884811878, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 206.53125, + "completions/mean_terminated_length": 206.53125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.19073179364204407, + "epoch": 0.17892156862745098, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3074569787892356, + "kl": 0.009971046820282936, + "learning_rate": 5.894308943089431e-07, + "loss": 0.0561, + "num_tokens": 4527908.0, + "reward": 0.625, + "reward_std": 0.6285127401351929, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000136137008667, + "sampling/importance_sampling_ratio/min": 0.4788450598716736, + "sampling/sampling_logp_difference/max": 0.807410478591919, + "sampling/sampling_logp_difference/mean": 0.014115612953901291, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 208.4375, + "completions/mean_terminated_length": 208.4375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.19065698981285095, + "epoch": 0.1801470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1376428733901416, + "kl": 0.003937162458896637, + "learning_rate": 5.934959349593496e-07, + "loss": -0.0116, + "num_tokens": 4556816.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5901724100112915, + "sampling/importance_sampling_ratio/mean": 0.9997545480728149, + "sampling/importance_sampling_ratio/min": 0.08214754611253738, + "sampling/sampling_logp_difference/max": 2.4992382526397705, + "sampling/sampling_logp_difference/mean": 0.012619711458683014, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 207.234375, + "completions/mean_terminated_length": 207.234375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.23524893820285797, + "epoch": 0.18137254901960784, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.2215328161770977, + "kl": 0.00898287259042263, + "learning_rate": 5.97560975609756e-07, + "loss": 0.0794, + "num_tokens": 4588735.0, + "reward": 0.59375, + "reward_std": 0.5457825064659119, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.9876911640167236, + "sampling/importance_sampling_ratio/mean": 1.0000414848327637, + "sampling/importance_sampling_ratio/min": 0.3859356641769409, + "sampling/sampling_logp_difference/max": 0.9520845413208008, + "sampling/sampling_logp_difference/mean": 0.018174968659877777, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 146.453125, + "completions/mean_terminated_length": 146.453125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.1839636117219925, + "epoch": 0.18259803921568626, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.870807509522527, + "kl": 0.01568181812763214, + "learning_rate": 6.016260162601626e-07, + "loss": 0.0299, + "num_tokens": 4617836.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007880926132202, + "sampling/importance_sampling_ratio/min": 0.3016190528869629, + "sampling/sampling_logp_difference/max": 1.1985905170440674, + "sampling/sampling_logp_difference/mean": 0.01622135564684868, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 195.6875, + "completions/mean_terminated_length": 195.6875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2160109132528305, + "epoch": 0.18382352941176472, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1448145492871644, + "kl": 0.006461469456553459, + "learning_rate": 6.056910569105691e-07, + "loss": -0.007, + "num_tokens": 4645160.0, + "reward": 0.53125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000572204589844, + "sampling/importance_sampling_ratio/min": 0.23663082718849182, + "sampling/sampling_logp_difference/max": 1.4412540197372437, + "sampling/sampling_logp_difference/mean": 0.01504783146083355, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 192.171875, + "completions/mean_terminated_length": 192.171875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.28349563479423523, + "epoch": 0.18504901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8813112982324198, + "kl": 0.008609576150774956, + "learning_rate": 6.097560975609756e-07, + "loss": -0.0038, + "num_tokens": 4679859.0, + "reward": 0.6875, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.7709717750549316, + "sampling/importance_sampling_ratio/mean": 1.0000584125518799, + "sampling/importance_sampling_ratio/min": 0.5407850742340088, + "sampling/sampling_logp_difference/max": 0.6147333383560181, + "sampling/sampling_logp_difference/mean": 0.01752694509923458, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 178.296875, + "completions/mean_terminated_length": 178.296875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.1929841935634613, + "epoch": 0.18627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.285064208466724, + "kl": 0.009654335677623749, + "learning_rate": 6.13821138211382e-07, + "loss": 0.0015, + "num_tokens": 4711766.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000225305557251, + "sampling/importance_sampling_ratio/min": 0.5676493048667908, + "sampling/sampling_logp_difference/max": 0.7288963794708252, + "sampling/sampling_logp_difference/mean": 0.014351408928632736, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 159.8125, + "completions/mean_terminated_length": 159.8125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.18735501170158386, + "epoch": 0.1875, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6921133600368123, + "kl": 0.010966267436742783, + "learning_rate": 6.178861788617887e-07, + "loss": -0.0191, + "num_tokens": 4738634.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.751664400100708, + "sampling/importance_sampling_ratio/mean": 1.0007822513580322, + "sampling/importance_sampling_ratio/min": 0.3664078414440155, + "sampling/sampling_logp_difference/max": 1.0040082931518555, + "sampling/sampling_logp_difference/mean": 0.014257569797337055, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 215.921875, + "completions/mean_terminated_length": 215.921875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2175692319869995, + "epoch": 0.18872549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0600521345041236, + "kl": 0.008125945925712585, + "learning_rate": 6.219512195121951e-07, + "loss": -0.0697, + "num_tokens": 4772085.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993202090263367, + "sampling/importance_sampling_ratio/min": 0.23123972117900848, + "sampling/sampling_logp_difference/max": 1.4643003940582275, + "sampling/sampling_logp_difference/mean": 0.01601422019302845, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.21947787702083588, + "epoch": 0.18995098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.5007272914259344, + "kl": 0.014036942273378372, + "learning_rate": 6.260162601626016e-07, + "loss": 0.0502, + "num_tokens": 4798853.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994269609451294, + "sampling/importance_sampling_ratio/min": 0.48846733570098877, + "sampling/sampling_logp_difference/max": 0.9443447589874268, + "sampling/sampling_logp_difference/mean": 0.016575731337070465, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 164.03125, + "completions/mean_terminated_length": 164.03125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.20363306999206543, + "epoch": 0.19117647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9863150545331905, + "kl": 0.01705070771276951, + "learning_rate": 6.300813008130081e-07, + "loss": -0.0159, + "num_tokens": 4826183.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000278949737549, + "sampling/importance_sampling_ratio/min": 0.2822778820991516, + "sampling/sampling_logp_difference/max": 1.2648632526397705, + "sampling/sampling_logp_difference/mean": 0.014696292579174042, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 150.5625, + "completions/mean_terminated_length": 150.5625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.16793668270111084, + "epoch": 0.19240196078431374, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8347136815339293, + "kl": 0.013624644838273525, + "learning_rate": 6.341463414634146e-07, + "loss": -0.001, + "num_tokens": 4856155.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000171661376953, + "sampling/importance_sampling_ratio/min": 0.3773249685764313, + "sampling/sampling_logp_difference/max": 0.9746484756469727, + "sampling/sampling_logp_difference/mean": 0.015161644667387009, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 164.640625, + "completions/mean_terminated_length": 164.640625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.19261035323143005, + "epoch": 0.19362745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0138578712957336, + "kl": 0.0075540426187217236, + "learning_rate": 6.382113821138211e-07, + "loss": 0.0021, + "num_tokens": 4881396.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.9841989278793335, + "sampling/importance_sampling_ratio/mean": 1.0002670288085938, + "sampling/importance_sampling_ratio/min": 0.5383151769638062, + "sampling/sampling_logp_difference/max": 0.6852152347564697, + "sampling/sampling_logp_difference/mean": 0.014371512457728386, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 181.703125, + "completions/mean_terminated_length": 181.703125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.19912806153297424, + "epoch": 0.1948529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.316865502848925, + "kl": 0.006664983928203583, + "learning_rate": 6.422764227642276e-07, + "loss": 0.0183, + "num_tokens": 4908721.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8389919996261597, + "sampling/importance_sampling_ratio/mean": 0.9989259839057922, + "sampling/importance_sampling_ratio/min": 0.21832527220249176, + "sampling/sampling_logp_difference/max": 1.5217692852020264, + "sampling/sampling_logp_difference/mean": 0.01603008806705475, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 176.8125, + "completions/mean_terminated_length": 176.8125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.23629969358444214, + "epoch": 0.19607843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8826280925730605, + "kl": 0.011435983702540398, + "learning_rate": 6.463414634146342e-07, + "loss": -0.0818, + "num_tokens": 4939669.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8665122985839844, + "sampling/importance_sampling_ratio/mean": 0.9996390342712402, + "sampling/importance_sampling_ratio/min": 0.4956026077270508, + "sampling/sampling_logp_difference/max": 0.7019808292388916, + "sampling/sampling_logp_difference/mean": 0.016076810657978058, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 181.765625, + "completions/mean_terminated_length": 181.765625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.1463281810283661, + "epoch": 0.19730392156862744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028215489525451012, + "kl": 0.004451300948858261, + "learning_rate": 6.504065040650406e-07, + "loss": 0.0, + "num_tokens": 4968406.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001604557037354, + "sampling/importance_sampling_ratio/min": 0.07517139613628387, + "sampling/sampling_logp_difference/max": 2.587984561920166, + "sampling/sampling_logp_difference/mean": 0.01525675505399704, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 154.46875, + "completions/mean_terminated_length": 154.46875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2354339212179184, + "epoch": 0.19852941176470587, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.407889101784422, + "kl": 0.014467542991042137, + "learning_rate": 6.544715447154471e-07, + "loss": -0.0004, + "num_tokens": 4997012.0, + "reward": 0.21875, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.5366731882095337, + "sampling/importance_sampling_ratio/mean": 0.9997107982635498, + "sampling/importance_sampling_ratio/min": 0.387630432844162, + "sampling/sampling_logp_difference/max": 0.9477028846740723, + "sampling/sampling_logp_difference/mean": 0.016185423359274864, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 207.015625, + "completions/mean_terminated_length": 207.015625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.1951788365840912, + "epoch": 0.19975490196078433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044195899524379294, + "kl": 0.008431365713477135, + "learning_rate": 6.585365853658536e-07, + "loss": 0.0001, + "num_tokens": 5027269.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8064712285995483, + "sampling/importance_sampling_ratio/mean": 1.0001356601715088, + "sampling/importance_sampling_ratio/min": 0.42160293459892273, + "sampling/sampling_logp_difference/max": 0.8636913299560547, + "sampling/sampling_logp_difference/mean": 0.014173239469528198, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 192.015625, + "completions/mean_terminated_length": 192.015625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.1966334581375122, + "epoch": 0.20098039215686275, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5490323583576546, + "kl": 0.011206312105059624, + "learning_rate": 6.626016260162602e-07, + "loss": -0.0075, + "num_tokens": 5072278.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003595352172852, + "sampling/importance_sampling_ratio/min": 0.14512112736701965, + "sampling/sampling_logp_difference/max": 1.9301865100860596, + "sampling/sampling_logp_difference/mean": 0.015656569972634315, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 162.21875, + "completions/mean_terminated_length": 162.21875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.2012788951396942, + "epoch": 0.20220588235294118, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9594882642866283, + "kl": 0.009323729202151299, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0417, + "num_tokens": 5100372.0, + "reward": 0.71875, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.6112180948257446, + "sampling/importance_sampling_ratio/mean": 0.9993187189102173, + "sampling/importance_sampling_ratio/min": 0.49222439527511597, + "sampling/sampling_logp_difference/max": 0.7088205814361572, + "sampling/sampling_logp_difference/mean": 0.012670625001192093, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 127.234375, + "completions/mean_terminated_length": 127.234375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.18998035788536072, + "epoch": 0.2034313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1797690247211605, + "kl": 0.010904636234045029, + "learning_rate": 6.707317073170731e-07, + "loss": 0.0096, + "num_tokens": 5123987.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.8535860776901245, + "sampling/importance_sampling_ratio/mean": 0.9999647736549377, + "sampling/importance_sampling_ratio/min": 0.4310276508331299, + "sampling/sampling_logp_difference/max": 0.8415830135345459, + "sampling/sampling_logp_difference/mean": 0.015727341175079346, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 170.921875, + "completions/mean_terminated_length": 170.921875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.21414750814437866, + "epoch": 0.20465686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03840811104929106, + "kl": 0.007046514190733433, + "learning_rate": 6.747967479674797e-07, + "loss": 0.0001, + "num_tokens": 5152254.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7908974885940552, + "sampling/importance_sampling_ratio/mean": 0.9999610781669617, + "sampling/importance_sampling_ratio/min": 0.546879768371582, + "sampling/sampling_logp_difference/max": 0.60352623462677, + "sampling/sampling_logp_difference/mean": 0.01575840264558792, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 180.25, + "completions/mean_terminated_length": 180.25, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.24589388072490692, + "epoch": 0.20588235294117646, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.3035277481025394, + "kl": 0.009675178676843643, + "learning_rate": 6.788617886178861e-07, + "loss": -0.0053, + "num_tokens": 5181406.0, + "reward": 0.34375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9536131620407104, + "sampling/importance_sampling_ratio/mean": 0.9992693662643433, + "sampling/importance_sampling_ratio/min": 0.26408031582832336, + "sampling/sampling_logp_difference/max": 1.3315019607543945, + "sampling/sampling_logp_difference/mean": 0.01715805009007454, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 181.9375, + "completions/mean_terminated_length": 181.9375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.24018113315105438, + "epoch": 0.20710784313725492, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5100028986553524, + "kl": 0.0078086634166538715, + "learning_rate": 6.829268292682927e-07, + "loss": 0.0068, + "num_tokens": 5210938.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7270176410675049, + "sampling/importance_sampling_ratio/mean": 1.000520944595337, + "sampling/importance_sampling_ratio/min": 0.36654388904571533, + "sampling/sampling_logp_difference/max": 1.0036370754241943, + "sampling/sampling_logp_difference/mean": 0.01715228334069252, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.24454890191555023, + "epoch": 0.20833333333333334, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.099218301218686, + "kl": 0.007833386771380901, + "learning_rate": 6.869918699186991e-07, + "loss": 0.0956, + "num_tokens": 5244466.0, + "reward": 0.46875, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994741678237915, + "sampling/importance_sampling_ratio/min": 0.3991053104400635, + "sampling/sampling_logp_difference/max": 0.9185299873352051, + "sampling/sampling_logp_difference/mean": 0.01967196725308895, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 142.53125, + "completions/mean_terminated_length": 142.53125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.22392506897449493, + "epoch": 0.20955882352941177, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.2958306867135656, + "kl": 0.008253801614046097, + "learning_rate": 6.910569105691057e-07, + "loss": 0.0075, + "num_tokens": 5271508.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6007570028305054, + "sampling/importance_sampling_ratio/mean": 1.000047206878662, + "sampling/importance_sampling_ratio/min": 0.5157365202903748, + "sampling/sampling_logp_difference/max": 0.6621593236923218, + "sampling/sampling_logp_difference/mean": 0.015675466507673264, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 200.453125, + "completions/mean_terminated_length": 200.453125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.24501550197601318, + "epoch": 0.2107843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9957968776660147, + "kl": 0.00728217139840126, + "learning_rate": 6.951219512195121e-07, + "loss": 0.0107, + "num_tokens": 5305713.0, + "reward": 0.5625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.8588887453079224, + "sampling/importance_sampling_ratio/mean": 1.000617265701294, + "sampling/importance_sampling_ratio/min": 0.37776103615760803, + "sampling/sampling_logp_difference/max": 0.9734934568405151, + "sampling/sampling_logp_difference/mean": 0.016320761293172836, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 151.09375, + "completions/mean_terminated_length": 151.09375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.16363389790058136, + "epoch": 0.21200980392156862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06250278516304968, + "kl": 0.007543622981756926, + "learning_rate": 6.991869918699187e-07, + "loss": 0.0001, + "num_tokens": 5333559.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001720190048218, + "sampling/importance_sampling_ratio/min": 0.49989962577819824, + "sampling/sampling_logp_difference/max": 0.7407248020172119, + "sampling/sampling_logp_difference/mean": 0.012731247581541538, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 135.21875, + "completions/mean_terminated_length": 135.21875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2280770242214203, + "epoch": 0.21323529411764705, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.4754516499699335, + "kl": 0.01365518569946289, + "learning_rate": 7.032520325203252e-07, + "loss": 0.0067, + "num_tokens": 5360981.0, + "reward": 0.28125, + "reward_std": 0.7129635810852051, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.41998329758644104, + "sampling/sampling_logp_difference/max": 0.8675403594970703, + "sampling/sampling_logp_difference/mean": 0.017936518415808678, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 192.390625, + "completions/mean_terminated_length": 192.390625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.21126362681388855, + "epoch": 0.21446078431372548, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.480997335273884, + "kl": 0.009101223200559616, + "learning_rate": 7.073170731707316e-07, + "loss": -0.0795, + "num_tokens": 5394542.0, + "reward": 0.3125, + "reward_std": 0.6143567562103271, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9185150861740112, + "sampling/importance_sampling_ratio/mean": 0.9991146326065063, + "sampling/importance_sampling_ratio/min": 0.549663782119751, + "sampling/sampling_logp_difference/max": 0.6515514850616455, + "sampling/sampling_logp_difference/mean": 0.014745143242180347, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 146.90625, + "completions/mean_terminated_length": 146.90625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2415069192647934, + "epoch": 0.21568627450980393, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.4862652865416726, + "kl": 0.013575580902397633, + "learning_rate": 7.113821138211382e-07, + "loss": -0.099, + "num_tokens": 5426520.0, + "reward": 0.3125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9987435340881348, + "sampling/importance_sampling_ratio/mean": 0.9995778799057007, + "sampling/importance_sampling_ratio/min": 0.38637208938598633, + "sampling/sampling_logp_difference/max": 0.9509544372558594, + "sampling/sampling_logp_difference/mean": 0.018081510439515114, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 162.015625, + "completions/mean_terminated_length": 162.015625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.18460117280483246, + "epoch": 0.21691176470588236, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.583307746648768, + "kl": 0.00942918099462986, + "learning_rate": 7.154471544715447e-07, + "loss": 0.0146, + "num_tokens": 5459801.0, + "reward": -0.0625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.9393787384033203, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.3568578064441681, + "sampling/sampling_logp_difference/max": 1.0304179191589355, + "sampling/sampling_logp_difference/mean": 0.015044205822050571, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 163.578125, + "completions/mean_terminated_length": 163.578125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.15562379360198975, + "epoch": 0.2181372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 14.920599056075442, + "kl": 0.007000989280641079, + "learning_rate": 7.195121951219512e-07, + "loss": -0.1402, + "num_tokens": 5490590.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997345209121704, + "sampling/importance_sampling_ratio/min": 0.37634479999542236, + "sampling/sampling_logp_difference/max": 0.9772495627403259, + "sampling/sampling_logp_difference/mean": 0.01211632415652275, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 143.28125, + "completions/mean_terminated_length": 143.28125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.17127113044261932, + "epoch": 0.2193627450980392, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.207770064939479, + "kl": 0.014192151837050915, + "learning_rate": 7.235772357723577e-07, + "loss": -0.1414, + "num_tokens": 5525120.0, + "reward": -0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003180503845215, + "sampling/importance_sampling_ratio/min": 0.3161073923110962, + "sampling/sampling_logp_difference/max": 1.1516733169555664, + "sampling/sampling_logp_difference/mean": 0.0200350321829319, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 145.21875, + "completions/mean_terminated_length": 145.21875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.18500688672065735, + "epoch": 0.22058823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7760482190945632, + "kl": 0.010376361198723316, + "learning_rate": 7.276422764227642e-07, + "loss": -0.0076, + "num_tokens": 5554158.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.9286831617355347, + "sampling/importance_sampling_ratio/mean": 1.0003862380981445, + "sampling/importance_sampling_ratio/min": 0.3744829595088959, + "sampling/sampling_logp_difference/max": 0.9822089672088623, + "sampling/sampling_logp_difference/mean": 0.01596665009856224, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 163.734375, + "completions/mean_terminated_length": 163.734375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.16916601359844208, + "epoch": 0.22181372549019607, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1885559148855194, + "kl": 0.014154801145195961, + "learning_rate": 7.317073170731707e-07, + "loss": 0.0202, + "num_tokens": 5580589.0, + "reward": 0.21875, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.9891959428787231, + "sampling/importance_sampling_ratio/mean": 0.9989780783653259, + "sampling/importance_sampling_ratio/min": 0.47438451647758484, + "sampling/sampling_logp_difference/max": 0.7457370758056641, + "sampling/sampling_logp_difference/mean": 0.013993790373206139, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 135.75, + "completions/mean_terminated_length": 135.75, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.17388153076171875, + "epoch": 0.22303921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9196768856258746, + "kl": 0.009921906515955925, + "learning_rate": 7.357723577235772e-07, + "loss": 0.0211, + "num_tokens": 5611837.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.752700686454773, + "sampling/importance_sampling_ratio/mean": 0.9996520280838013, + "sampling/importance_sampling_ratio/min": 0.5270344018936157, + "sampling/sampling_logp_difference/max": 0.6404894590377808, + "sampling/sampling_logp_difference/mean": 0.013768360950052738, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.16102421283721924, + "epoch": 0.22426470588235295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0247183771509286, + "kl": 0.006647592410445213, + "learning_rate": 7.398373983739837e-07, + "loss": 0.0001, + "num_tokens": 5644767.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.998921275138855, + "sampling/importance_sampling_ratio/min": 0.24369849264621735, + "sampling/sampling_logp_difference/max": 1.4118235111236572, + "sampling/sampling_logp_difference/mean": 0.014862995594739914, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 136.203125, + "completions/mean_terminated_length": 136.203125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.2119152545928955, + "epoch": 0.22549019607843138, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.434777201904055, + "kl": 0.01427440531551838, + "learning_rate": 7.439024390243903e-07, + "loss": 0.0337, + "num_tokens": 5668364.0, + "reward": -0.125, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006811618804932, + "sampling/importance_sampling_ratio/min": 0.4294593632221222, + "sampling/sampling_logp_difference/max": 0.9697303771972656, + "sampling/sampling_logp_difference/mean": 0.01680961810052395, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 147.0625, + "completions/mean_terminated_length": 147.0625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.20346541702747345, + "epoch": 0.2267156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5941225362087368, + "kl": 0.0125267980620265, + "learning_rate": 7.479674796747967e-07, + "loss": -0.0058, + "num_tokens": 5695232.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.9195092916488647, + "sampling/importance_sampling_ratio/mean": 1.0003814697265625, + "sampling/importance_sampling_ratio/min": 0.4757547974586487, + "sampling/sampling_logp_difference/max": 0.7428526878356934, + "sampling/sampling_logp_difference/mean": 0.01674540340900421, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 162.796875, + "completions/mean_terminated_length": 162.796875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.15275320410728455, + "epoch": 0.22794117647058823, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.866930979749354, + "kl": 0.12918700277805328, + "learning_rate": 7.520325203252032e-07, + "loss": 0.0253, + "num_tokens": 5726259.0, + "reward": 0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997475147247314, + "sampling/importance_sampling_ratio/min": 0.001375882071442902, + "sampling/sampling_logp_difference/max": 6.58866024017334, + "sampling/sampling_logp_difference/mean": 0.013679513707756996, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 217.859375, + "completions/mean_terminated_length": 217.859375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.25289398431777954, + "epoch": 0.22916666666666666, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.257704337020236, + "kl": 0.008466934785246849, + "learning_rate": 7.560975609756097e-07, + "loss": 0.0071, + "num_tokens": 5758778.0, + "reward": 0.65625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.8894721269607544, + "sampling/importance_sampling_ratio/mean": 0.9993259906768799, + "sampling/importance_sampling_ratio/min": 0.29954442381858826, + "sampling/sampling_logp_difference/max": 1.2054924964904785, + "sampling/sampling_logp_difference/mean": 0.017889931797981262, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 170.09375, + "completions/mean_terminated_length": 170.09375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.23489050567150116, + "epoch": 0.23039215686274508, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.3651801574179805, + "kl": 0.0169069841504097, + "learning_rate": 7.601626016260162e-07, + "loss": 0.0554, + "num_tokens": 5796736.0, + "reward": -0.0625, + "reward_std": 0.5765564441680908, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996923208236694, + "sampling/importance_sampling_ratio/min": 0.3697149455547333, + "sampling/sampling_logp_difference/max": 1.2095823287963867, + "sampling/sampling_logp_difference/mean": 0.018920443952083588, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 151.546875, + "completions/mean_terminated_length": 151.546875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.19983212649822235, + "epoch": 0.23161764705882354, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.7491469653398797, + "kl": 0.018551960587501526, + "learning_rate": 7.642276422764228e-07, + "loss": 0.0006, + "num_tokens": 5822291.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9441649913787842, + "sampling/importance_sampling_ratio/mean": 1.0001943111419678, + "sampling/importance_sampling_ratio/min": 0.3973112106323242, + "sampling/sampling_logp_difference/max": 0.9230353832244873, + "sampling/sampling_logp_difference/mean": 0.015498116612434387, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 122.140625, + "completions/mean_terminated_length": 122.140625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.17167788743972778, + "epoch": 0.23284313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0839651393069176, + "kl": 0.015019101090729237, + "learning_rate": 7.682926829268292e-07, + "loss": -0.0065, + "num_tokens": 5845404.0, + "reward": -0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.9676556587219238, + "sampling/importance_sampling_ratio/mean": 1.0010113716125488, + "sampling/importance_sampling_ratio/min": 0.4645615816116333, + "sampling/sampling_logp_difference/max": 0.7666611671447754, + "sampling/sampling_logp_difference/mean": 0.014353152364492416, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 152.03125, + "completions/mean_terminated_length": 152.03125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.21971988677978516, + "epoch": 0.2340686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.086321636924277, + "kl": 0.011947219260036945, + "learning_rate": 7.723577235772358e-07, + "loss": 0.0184, + "num_tokens": 5877198.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994181394577026, + "sampling/importance_sampling_ratio/min": 0.38398805260658264, + "sampling/sampling_logp_difference/max": 1.4673542976379395, + "sampling/sampling_logp_difference/mean": 0.017348386347293854, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 163.015625, + "completions/mean_terminated_length": 163.015625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.20136642456054688, + "epoch": 0.23529411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7120380707996077, + "kl": 0.008222850039601326, + "learning_rate": 7.764227642276422e-07, + "loss": 0.0271, + "num_tokens": 5902815.0, + "reward": -0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004281997680664, + "sampling/importance_sampling_ratio/min": 0.3861261010169983, + "sampling/sampling_logp_difference/max": 0.9515912532806396, + "sampling/sampling_logp_difference/mean": 0.01523653045296669, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.20873147249221802, + "epoch": 0.23651960784313725, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.466968744792855, + "kl": 0.007534465286880732, + "learning_rate": 7.804878048780488e-07, + "loss": -0.0454, + "num_tokens": 5945145.0, + "reward": 0.75, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992533922195435, + "sampling/importance_sampling_ratio/min": 0.13629940152168274, + "sampling/sampling_logp_difference/max": 1.99290132522583, + "sampling/sampling_logp_difference/mean": 0.014972115866839886, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.18233969807624817, + "epoch": 0.23774509803921567, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9686086293141357, + "kl": 0.008675255812704563, + "learning_rate": 7.845528455284552e-07, + "loss": 0.0009, + "num_tokens": 5978467.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998888373374939, + "sampling/importance_sampling_ratio/min": 0.28685885667800903, + "sampling/sampling_logp_difference/max": 1.248764991760254, + "sampling/sampling_logp_difference/mean": 0.013606593944132328, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 218.828125, + "completions/mean_terminated_length": 218.828125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.1996522843837738, + "epoch": 0.23897058823529413, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6731080287121975, + "kl": 0.014219471253454685, + "learning_rate": 7.886178861788617e-07, + "loss": -0.0101, + "num_tokens": 6011416.0, + "reward": 0.59375, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991976022720337, + "sampling/importance_sampling_ratio/min": 0.33685335516929626, + "sampling/sampling_logp_difference/max": 1.10807204246521, + "sampling/sampling_logp_difference/mean": 0.01521037332713604, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 155.84375, + "completions/mean_terminated_length": 155.84375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.13060662150382996, + "epoch": 0.24019607843137256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0643870022708985, + "kl": 0.008615046739578247, + "learning_rate": 7.926829268292683e-07, + "loss": 0.0001, + "num_tokens": 6037774.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991104602813721, + "sampling/importance_sampling_ratio/min": 0.2513620853424072, + "sampling/sampling_logp_difference/max": 1.3808608055114746, + "sampling/sampling_logp_difference/mean": 0.0147407790645957, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 161.890625, + "completions/mean_terminated_length": 161.890625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.23081961274147034, + "epoch": 0.24142156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.3276566035807305, + "kl": 0.018241455778479576, + "learning_rate": 7.967479674796747e-07, + "loss": 0.0068, + "num_tokens": 6067831.0, + "reward": 0.25, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001935958862305, + "sampling/importance_sampling_ratio/min": 0.5692372918128967, + "sampling/sampling_logp_difference/max": 0.9607486724853516, + "sampling/sampling_logp_difference/mean": 0.016515467315912247, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 193.53125, + "completions/mean_terminated_length": 193.53125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.20671209692955017, + "epoch": 0.2426470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9772604678970633, + "kl": 0.012380264699459076, + "learning_rate": 8.008130081300813e-07, + "loss": 0.0614, + "num_tokens": 6093689.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.8159511089324951, + "sampling/importance_sampling_ratio/mean": 1.0001795291900635, + "sampling/importance_sampling_ratio/min": 0.5512356162071228, + "sampling/sampling_logp_difference/max": 0.596609354019165, + "sampling/sampling_logp_difference/mean": 0.014003811404109001, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 172.53125, + "completions/mean_terminated_length": 172.53125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.18476727604866028, + "epoch": 0.24387254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9137306340793396, + "kl": 0.011842640116810799, + "learning_rate": 8.048780487804878e-07, + "loss": 0.0917, + "num_tokens": 6123595.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.7520594596862793, + "sampling/importance_sampling_ratio/mean": 1.0005278587341309, + "sampling/importance_sampling_ratio/min": 0.416149377822876, + "sampling/sampling_logp_difference/max": 0.8767110109329224, + "sampling/sampling_logp_difference/mean": 0.014176880940794945, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 224.5625, + "completions/mean_terminated_length": 224.5625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.21131455898284912, + "epoch": 0.24509803921568626, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1316313504215345, + "kl": 0.01181216724216938, + "learning_rate": 8.089430894308943e-07, + "loss": 0.019, + "num_tokens": 6163455.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.9490059614181519, + "sampling/importance_sampling_ratio/mean": 0.9999033212661743, + "sampling/importance_sampling_ratio/min": 0.480782151222229, + "sampling/sampling_logp_difference/max": 0.7323410511016846, + "sampling/sampling_logp_difference/mean": 0.01573982834815979, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 165.390625, + "completions/mean_terminated_length": 165.390625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.23284420371055603, + "epoch": 0.24632352941176472, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.203527965455497, + "kl": 0.012116186320781708, + "learning_rate": 8.130081300813008e-07, + "loss": -0.0002, + "num_tokens": 6191192.0, + "reward": 0.15625, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.7609137296676636, + "sampling/importance_sampling_ratio/mean": 1.0000020265579224, + "sampling/importance_sampling_ratio/min": 0.44058364629745483, + "sampling/sampling_logp_difference/max": 0.8196549415588379, + "sampling/sampling_logp_difference/mean": 0.01848536543548107, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 176.109375, + "completions/mean_terminated_length": 176.109375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.1951962411403656, + "epoch": 0.24754901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2565005795965583, + "kl": 0.015520875342190266, + "learning_rate": 8.170731707317072e-07, + "loss": -0.0195, + "num_tokens": 6221775.0, + "reward": 0.0625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992324113845825, + "sampling/importance_sampling_ratio/min": 0.5110778212547302, + "sampling/sampling_logp_difference/max": 0.8343586921691895, + "sampling/sampling_logp_difference/mean": 0.015476308763027191, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 131.625, + "completions/mean_terminated_length": 131.625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.17135287821292877, + "epoch": 0.24877450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.5963026554811197, + "kl": 0.020184125751256943, + "learning_rate": 8.211382113821138e-07, + "loss": 0.0503, + "num_tokens": 6245191.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6214877367019653, + "sampling/importance_sampling_ratio/mean": 0.9992965459823608, + "sampling/importance_sampling_ratio/min": 0.4405844509601593, + "sampling/sampling_logp_difference/max": 0.8196531534194946, + "sampling/sampling_logp_difference/mean": 0.01627563126385212, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 147.78125, + "completions/mean_terminated_length": 147.78125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.17383095622062683, + "epoch": 0.25, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3624706522297745, + "kl": 0.015773724764585495, + "learning_rate": 8.252032520325202e-07, + "loss": -0.0004, + "num_tokens": 6276105.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999467134475708, + "sampling/importance_sampling_ratio/min": 0.4982483386993408, + "sampling/sampling_logp_difference/max": 0.7709488868713379, + "sampling/sampling_logp_difference/mean": 0.013784103095531464, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 140.171875, + "completions/mean_terminated_length": 140.171875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.23402664065361023, + "epoch": 0.2512254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3533101667704015, + "kl": 0.019832585006952286, + "learning_rate": 8.292682926829268e-07, + "loss": -0.0299, + "num_tokens": 6300468.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006000995635986, + "sampling/importance_sampling_ratio/min": 0.10227715969085693, + "sampling/sampling_logp_difference/max": 2.280068874359131, + "sampling/sampling_logp_difference/mean": 0.01633118838071823, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 165.015625, + "completions/mean_terminated_length": 165.015625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.1390296220779419, + "epoch": 0.25245098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05031020405828662, + "kl": 0.011352060362696648, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0001, + "num_tokens": 6331157.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.615748405456543, + "sampling/importance_sampling_ratio/mean": 1.000266432762146, + "sampling/importance_sampling_ratio/min": 0.3780434727668762, + "sampling/sampling_logp_difference/max": 0.9727461338043213, + "sampling/sampling_logp_difference/mean": 0.010921423323452473, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 202.765625, + "completions/mean_terminated_length": 202.765625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2006179839372635, + "epoch": 0.2536764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3644209320380678, + "kl": 0.008472388610243797, + "learning_rate": 8.373983739837398e-07, + "loss": 0.042, + "num_tokens": 6362310.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.710673451423645, + "sampling/importance_sampling_ratio/mean": 0.9995939135551453, + "sampling/importance_sampling_ratio/min": 0.490130215883255, + "sampling/sampling_logp_difference/max": 0.7130842208862305, + "sampling/sampling_logp_difference/mean": 0.015032893046736717, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 164.96875, + "completions/mean_terminated_length": 164.96875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.22202682495117188, + "epoch": 0.2549019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.177038224537697, + "kl": 0.020616548135876656, + "learning_rate": 8.414634146341463e-07, + "loss": 0.0217, + "num_tokens": 6387444.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997557401657104, + "sampling/importance_sampling_ratio/min": 0.50432288646698, + "sampling/sampling_logp_difference/max": 0.7088422775268555, + "sampling/sampling_logp_difference/mean": 0.015844713896512985, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 141.640625, + "completions/mean_terminated_length": 141.640625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.2225850522518158, + "epoch": 0.25612745098039214, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.6814906793295235, + "kl": 0.01864427700638771, + "learning_rate": 8.455284552845529e-07, + "loss": 0.0033, + "num_tokens": 6411549.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.8062304258346558, + "sampling/importance_sampling_ratio/mean": 1.0005677938461304, + "sampling/importance_sampling_ratio/min": 0.5038042068481445, + "sampling/sampling_logp_difference/max": 0.6855676174163818, + "sampling/sampling_logp_difference/mean": 0.017111271619796753, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 142.28125, + "completions/mean_terminated_length": 142.28125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.1608240008354187, + "epoch": 0.25735294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.018945743347563, + "kl": 0.013830848038196564, + "learning_rate": 8.495934959349593e-07, + "loss": 0.0104, + "num_tokens": 6438687.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9990195631980896, + "sampling/importance_sampling_ratio/min": 0.5116662383079529, + "sampling/sampling_logp_difference/max": 0.7109653949737549, + "sampling/sampling_logp_difference/mean": 0.014174254611134529, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 146.390625, + "completions/mean_terminated_length": 146.390625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.20424455404281616, + "epoch": 0.25857843137254904, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8196388901766183, + "kl": 0.01573394425213337, + "learning_rate": 8.536585365853657e-07, + "loss": 0.0117, + "num_tokens": 6466040.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.9441547393798828, + "sampling/importance_sampling_ratio/mean": 1.0002039670944214, + "sampling/importance_sampling_ratio/min": 0.23401731252670288, + "sampling/sampling_logp_difference/max": 1.4523601531982422, + "sampling/sampling_logp_difference/mean": 0.01648673601448536, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 167.421875, + "completions/mean_terminated_length": 167.421875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.17787525057792664, + "epoch": 0.25980392156862747, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.6141797340196433, + "kl": 0.011871451511979103, + "learning_rate": 8.577235772357723e-07, + "loss": -0.0182, + "num_tokens": 6491203.0, + "reward": 0.4375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9012318849563599, + "sampling/importance_sampling_ratio/mean": 0.9989027380943298, + "sampling/importance_sampling_ratio/min": 0.41614916920661926, + "sampling/sampling_logp_difference/max": 0.8767114877700806, + "sampling/sampling_logp_difference/mean": 0.015307286754250526, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 197.71875, + "completions/mean_terminated_length": 197.71875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.18765902519226074, + "epoch": 0.2610294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.643178268300292, + "kl": 0.015506663359701633, + "learning_rate": 8.617886178861788e-07, + "loss": 0.031, + "num_tokens": 6525969.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998393058776855, + "sampling/importance_sampling_ratio/min": 0.2954738736152649, + "sampling/sampling_logp_difference/max": 1.219174861907959, + "sampling/sampling_logp_difference/mean": 0.01343243382871151, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 173.984375, + "completions/mean_terminated_length": 173.984375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2047746777534485, + "epoch": 0.2622549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.5217419801274326, + "kl": 0.022509688511490822, + "learning_rate": 8.658536585365853e-07, + "loss": 0.0669, + "num_tokens": 6561760.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.5406491160392761, + "sampling/sampling_logp_difference/max": 0.8521547317504883, + "sampling/sampling_logp_difference/mean": 0.01589590311050415, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 161.78125, + "completions/mean_terminated_length": 161.78125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.19016176462173462, + "epoch": 0.26348039215686275, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9519911585691987, + "kl": 0.021345844492316246, + "learning_rate": 8.699186991869918e-07, + "loss": 0.0248, + "num_tokens": 6599810.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6213493347167969, + "sampling/importance_sampling_ratio/mean": 0.9995770454406738, + "sampling/importance_sampling_ratio/min": 0.444172203540802, + "sampling/sampling_logp_difference/max": 0.8115429878234863, + "sampling/sampling_logp_difference/mean": 0.016685977578163147, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 210.53125, + "completions/mean_terminated_length": 210.53125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.16471557319164276, + "epoch": 0.2647058823529412, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8635683806955845, + "kl": 0.01851458102464676, + "learning_rate": 8.739837398373984e-07, + "loss": -0.1021, + "num_tokens": 6631892.0, + "reward": 0.1875, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002044439315796, + "sampling/importance_sampling_ratio/min": 0.38777610659599304, + "sampling/sampling_logp_difference/max": 0.9473271369934082, + "sampling/sampling_logp_difference/mean": 0.013975502923130989, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 185.1875, + "completions/mean_terminated_length": 185.1875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.16010968387126923, + "epoch": 0.2659313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6897227310673995, + "kl": 0.01930549368262291, + "learning_rate": 8.780487804878048e-07, + "loss": -0.0573, + "num_tokens": 6662816.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.821642518043518, + "sampling/importance_sampling_ratio/mean": 1.0006334781646729, + "sampling/importance_sampling_ratio/min": 0.4810205101966858, + "sampling/sampling_logp_difference/max": 0.7318453788757324, + "sampling/sampling_logp_difference/mean": 0.012054579332470894, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 141.34375, + "completions/mean_terminated_length": 141.34375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.17018850147724152, + "epoch": 0.26715686274509803, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0365420752399577, + "kl": 0.0233754962682724, + "learning_rate": 8.821138211382113e-07, + "loss": -0.0094, + "num_tokens": 6693078.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001654624938965, + "sampling/importance_sampling_ratio/min": 0.40650513768196106, + "sampling/sampling_logp_difference/max": 0.9001587629318237, + "sampling/sampling_logp_difference/mean": 0.013843964785337448, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 167.828125, + "completions/mean_terminated_length": 167.828125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.18336881697177887, + "epoch": 0.26838235294117646, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.934212915296, + "kl": 0.01532343216240406, + "learning_rate": 8.861788617886179e-07, + "loss": 0.0647, + "num_tokens": 6720331.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003345012664795, + "sampling/importance_sampling_ratio/min": 0.4310505986213684, + "sampling/sampling_logp_difference/max": 1.2630889415740967, + "sampling/sampling_logp_difference/mean": 0.014666068367660046, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 145.59375, + "completions/mean_terminated_length": 145.59375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.19705787301063538, + "epoch": 0.2696078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9169989542169759, + "kl": 0.02064172551035881, + "learning_rate": 8.902439024390244e-07, + "loss": 0.0325, + "num_tokens": 6747377.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996950030326843, + "sampling/importance_sampling_ratio/min": 0.4441492259502411, + "sampling/sampling_logp_difference/max": 0.8115947246551514, + "sampling/sampling_logp_difference/mean": 0.014506572857499123, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 197.296875, + "completions/mean_terminated_length": 197.296875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.26085400581359863, + "epoch": 0.2708333333333333, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.222035021598264, + "kl": 0.03441544622182846, + "learning_rate": 8.943089430894308e-07, + "loss": 0.0238, + "num_tokens": 6775668.0, + "reward": -0.40625, + "reward_std": 0.565913200378418, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.9800100326538086, + "sampling/importance_sampling_ratio/mean": 1.0002212524414062, + "sampling/importance_sampling_ratio/min": 0.35668912529945374, + "sampling/sampling_logp_difference/max": 1.030890703201294, + "sampling/sampling_logp_difference/mean": 0.01797478087246418, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 116.515625, + "completions/mean_terminated_length": 116.515625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.15425002574920654, + "epoch": 0.27205882352941174, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.856688339430402, + "kl": 0.03441350907087326, + "learning_rate": 8.983739837398373e-07, + "loss": -0.0177, + "num_tokens": 6802933.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.6629855632781982, + "sampling/importance_sampling_ratio/mean": 1.0002639293670654, + "sampling/importance_sampling_ratio/min": 0.3411581218242645, + "sampling/sampling_logp_difference/max": 1.075409173965454, + "sampling/sampling_logp_difference/mean": 0.013100311160087585, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 174.375, + "completions/mean_terminated_length": 174.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.21026542782783508, + "epoch": 0.27328431372549017, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2633458308151913, + "kl": 0.02543017454445362, + "learning_rate": 9.024390243902439e-07, + "loss": -0.0153, + "num_tokens": 6833341.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999900221824646, + "sampling/importance_sampling_ratio/min": 0.3941422700881958, + "sampling/sampling_logp_difference/max": 0.9310433864593506, + "sampling/sampling_logp_difference/mean": 0.016453826799988747, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 144.03125, + "completions/mean_terminated_length": 144.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.19323155283927917, + "epoch": 0.27450980392156865, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.238740050855254, + "kl": 0.041469473391771317, + "learning_rate": 9.065040650406503e-07, + "loss": 0.0099, + "num_tokens": 6861551.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000789165496826, + "sampling/importance_sampling_ratio/min": 0.3232390582561493, + "sampling/sampling_logp_difference/max": 1.1293630599975586, + "sampling/sampling_logp_difference/mean": 0.015687420964241028, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 164.359375, + "completions/mean_terminated_length": 164.359375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.14599479734897614, + "epoch": 0.2757352941176471, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8989134891792927, + "kl": 0.018956230953335762, + "learning_rate": 9.105691056910569e-07, + "loss": -0.0124, + "num_tokens": 6887334.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.817976713180542, + "sampling/importance_sampling_ratio/mean": 0.9996324777603149, + "sampling/importance_sampling_ratio/min": 0.376531183719635, + "sampling/sampling_logp_difference/max": 0.9767544269561768, + "sampling/sampling_logp_difference/mean": 0.012233897112309933, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 136.34375, + "completions/mean_terminated_length": 136.34375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.2081795334815979, + "epoch": 0.2769607843137255, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.678218026388671, + "kl": 0.03165192902088165, + "learning_rate": 9.146341463414634e-07, + "loss": 0.0147, + "num_tokens": 6912236.0, + "reward": 0.125, + "reward_std": 0.5651718378067017, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6185600757598877, + "sampling/importance_sampling_ratio/mean": 1.0005383491516113, + "sampling/importance_sampling_ratio/min": 0.6078590154647827, + "sampling/sampling_logp_difference/max": 0.49781227111816406, + "sampling/sampling_logp_difference/mean": 0.015303589403629303, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 139.28125, + "completions/mean_terminated_length": 139.28125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.1498456597328186, + "epoch": 0.27818627450980393, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.3959461708268885, + "kl": 0.05385677143931389, + "learning_rate": 9.186991869918699e-07, + "loss": 0.0689, + "num_tokens": 6938574.0, + "reward": 0.375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.527761459350586, + "sampling/importance_sampling_ratio/mean": 0.9992302060127258, + "sampling/importance_sampling_ratio/min": 0.0881451815366745, + "sampling/sampling_logp_difference/max": 2.428770065307617, + "sampling/sampling_logp_difference/mean": 0.015360962599515915, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 170.875, + "completions/mean_terminated_length": 170.875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.16509565711021423, + "epoch": 0.27941176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08382376646922701, + "kl": 0.018290940672159195, + "learning_rate": 9.227642276422763e-07, + "loss": 0.0002, + "num_tokens": 6971622.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.769323706626892, + "sampling/importance_sampling_ratio/mean": 0.9997942447662354, + "sampling/importance_sampling_ratio/min": 0.5412097573280334, + "sampling/sampling_logp_difference/max": 0.6139483451843262, + "sampling/sampling_logp_difference/mean": 0.013449668884277344, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 152.71875, + "completions/mean_terminated_length": 152.71875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "entropy": 0.2042853981256485, + "epoch": 0.2806372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.8192731831295728, + "kl": 0.021124407649040222, + "learning_rate": 9.26829268292683e-07, + "loss": -0.0314, + "num_tokens": 7001028.0, + "reward": -0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000542402267456, + "sampling/importance_sampling_ratio/min": 0.15975400805473328, + "sampling/sampling_logp_difference/max": 2.1725785732269287, + "sampling/sampling_logp_difference/mean": 0.017363186925649643, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.19672785699367523, + "epoch": 0.2818627450980392, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.229775984302137, + "kl": 0.014601796865463257, + "learning_rate": 9.308943089430894e-07, + "loss": -0.0424, + "num_tokens": 7041636.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995140433311462, + "sampling/importance_sampling_ratio/min": 0.398908406496048, + "sampling/sampling_logp_difference/max": 0.9190235137939453, + "sampling/sampling_logp_difference/mean": 0.016202237457036972, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 180.875, + "completions/mean_terminated_length": 180.875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.12926724553108215, + "epoch": 0.28308823529411764, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5619613064277984, + "kl": 0.02092336118221283, + "learning_rate": 9.349593495934958e-07, + "loss": 0.0194, + "num_tokens": 7070924.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.8715226650238037, + "sampling/importance_sampling_ratio/mean": 1.000416874885559, + "sampling/importance_sampling_ratio/min": 0.38201507925987244, + "sampling/sampling_logp_difference/max": 0.9622951745986938, + "sampling/sampling_logp_difference/mean": 0.010455417446792126, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 131.3125, + "completions/mean_terminated_length": 131.3125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.23128622770309448, + "epoch": 0.28431372549019607, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.84886119094527, + "kl": 0.019218452274799347, + "learning_rate": 9.390243902439024e-07, + "loss": -0.0039, + "num_tokens": 7099472.0, + "reward": 0.25, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001335144042969, + "sampling/importance_sampling_ratio/min": 0.488338440656662, + "sampling/sampling_logp_difference/max": 0.7325534820556641, + "sampling/sampling_logp_difference/mean": 0.01647172123193741, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 149.125, + "completions/mean_terminated_length": 149.125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.1792403757572174, + "epoch": 0.2855392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8270636713308035, + "kl": 0.013000808656215668, + "learning_rate": 9.430894308943089e-07, + "loss": -0.0049, + "num_tokens": 7123976.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997499585151672, + "sampling/importance_sampling_ratio/min": 0.49598950147628784, + "sampling/sampling_logp_difference/max": 0.7306704521179199, + "sampling/sampling_logp_difference/mean": 0.013863112777471542, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 196.9375, + "completions/mean_terminated_length": 196.9375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.21675468981266022, + "epoch": 0.2867647058823529, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.4134308604205, + "kl": 0.022405382245779037, + "learning_rate": 9.471544715447154e-07, + "loss": 0.0002, + "num_tokens": 7158916.0, + "reward": 0.625, + "reward_std": 0.6047805547714233, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.602049708366394, + "sampling/importance_sampling_ratio/mean": 0.9996261596679688, + "sampling/importance_sampling_ratio/min": 0.4933438301086426, + "sampling/sampling_logp_difference/max": 0.7065489292144775, + "sampling/sampling_logp_difference/mean": 0.016713209450244904, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 148.296875, + "completions/mean_terminated_length": 148.296875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.1597084403038025, + "epoch": 0.28799019607843135, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7514324834207053, + "kl": 0.016031622886657715, + "learning_rate": 9.512195121951218e-07, + "loss": -0.0055, + "num_tokens": 7187783.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997620582580566, + "sampling/importance_sampling_ratio/min": 0.4871583580970764, + "sampling/sampling_logp_difference/max": 0.7891626358032227, + "sampling/sampling_logp_difference/mean": 0.012351596727967262, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 137.40625, + "completions/mean_terminated_length": 137.40625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2350776195526123, + "epoch": 0.28921568627450983, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.2546834242240266, + "kl": 0.023296181112527847, + "learning_rate": 9.552845528455285e-07, + "loss": -0.0099, + "num_tokens": 7211105.0, + "reward": 0.15625, + "reward_std": 0.7129635810852051, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.7656652927398682, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 0.3414660096168518, + "sampling/sampling_logp_difference/max": 1.0745071172714233, + "sampling/sampling_logp_difference/mean": 0.016961853951215744, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 212.578125, + "completions/mean_terminated_length": 212.578125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.22627148032188416, + "epoch": 0.29044117647058826, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2184433906931282, + "kl": 0.013846682384610176, + "learning_rate": 9.59349593495935e-07, + "loss": -0.0401, + "num_tokens": 7259302.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.998911440372467, + "sampling/importance_sampling_ratio/min": 0.47263118624687195, + "sampling/sampling_logp_difference/max": 0.8971564769744873, + "sampling/sampling_logp_difference/mean": 0.01815243996679783, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 195.453125, + "completions/mean_terminated_length": 195.453125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.20510563254356384, + "epoch": 0.2916666666666667, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5715892032303334, + "kl": 0.02353559248149395, + "learning_rate": 9.634146341463414e-07, + "loss": 0.0412, + "num_tokens": 7286035.0, + "reward": 0.0625, + "reward_std": 0.5738953948020935, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.990490198135376, + "sampling/importance_sampling_ratio/mean": 1.000231146812439, + "sampling/importance_sampling_ratio/min": 0.3369840681552887, + "sampling/sampling_logp_difference/max": 1.0877196788787842, + "sampling/sampling_logp_difference/mean": 0.01640462502837181, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 190.953125, + "completions/mean_terminated_length": 190.953125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.16903972625732422, + "epoch": 0.2928921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.351155146835451, + "kl": 0.013523912988603115, + "learning_rate": 9.67479674796748e-07, + "loss": 0.0591, + "num_tokens": 7318320.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995356798171997, + "sampling/importance_sampling_ratio/min": 0.1982087939977646, + "sampling/sampling_logp_difference/max": 1.6184343099594116, + "sampling/sampling_logp_difference/mean": 0.014733832329511642, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 240.25, + "completions/mean_terminated_length": 240.25, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.203177809715271, + "epoch": 0.29411764705882354, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.4134120577411617, + "kl": 0.009299861267209053, + "learning_rate": 9.715447154471544e-07, + "loss": -0.0012, + "num_tokens": 7350944.0, + "reward": -0.03125, + "reward_std": 0.5431214570999146, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002261400222778, + "sampling/importance_sampling_ratio/min": 0.3533068597316742, + "sampling/sampling_logp_difference/max": 1.040418267250061, + "sampling/sampling_logp_difference/mean": 0.014737317338585854, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 172.859375, + "completions/mean_terminated_length": 172.859375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.2432369738817215, + "epoch": 0.29534313725490197, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8605392135504952, + "kl": 0.01595490612089634, + "learning_rate": 9.756097560975609e-07, + "loss": -0.0717, + "num_tokens": 7378039.0, + "reward": -0.09375, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005640983581543, + "sampling/importance_sampling_ratio/min": 0.6027071475982666, + "sampling/sampling_logp_difference/max": 0.784705638885498, + "sampling/sampling_logp_difference/mean": 0.015564032830297947, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 172.9375, + "completions/mean_terminated_length": 172.9375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.22795718908309937, + "epoch": 0.2965686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.4576001673744328, + "kl": 0.01724562793970108, + "learning_rate": 9.796747967479673e-07, + "loss": 0.0406, + "num_tokens": 7414931.0, + "reward": 0.625, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.7553232908248901, + "sampling/importance_sampling_ratio/mean": 0.9996345043182373, + "sampling/importance_sampling_ratio/min": 0.43154290318489075, + "sampling/sampling_logp_difference/max": 0.8403884172439575, + "sampling/sampling_logp_difference/mean": 0.017535878345370293, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 183.546875, + "completions/mean_terminated_length": 183.546875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.22533947229385376, + "epoch": 0.2977941176470588, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.1914303458815296, + "kl": 0.017026592046022415, + "learning_rate": 9.83739837398374e-07, + "loss": 0.1802, + "num_tokens": 7442566.0, + "reward": 0.375, + "reward_std": 0.6267197132110596, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997953772544861, + "sampling/importance_sampling_ratio/min": 0.08131000399589539, + "sampling/sampling_logp_difference/max": 2.509486198425293, + "sampling/sampling_logp_difference/mean": 0.01854054443538189, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 166.78125, + "completions/mean_terminated_length": 166.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.19965288043022156, + "epoch": 0.29901960784313725, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.9273974052382448, + "kl": 0.0182357020676136, + "learning_rate": 9.878048780487804e-07, + "loss": 0.0259, + "num_tokens": 7470472.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6897634267807007, + "sampling/importance_sampling_ratio/mean": 1.0001301765441895, + "sampling/importance_sampling_ratio/min": 0.5680594444274902, + "sampling/sampling_logp_difference/max": 0.5655292272567749, + "sampling/sampling_logp_difference/mean": 0.014180110767483711, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 222.296875, + "completions/mean_terminated_length": 222.296875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.21197420358657837, + "epoch": 0.3002450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9902050280315178, + "kl": 0.016592450439929962, + "learning_rate": 9.918699186991869e-07, + "loss": 0.0053, + "num_tokens": 7510923.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000331163406372, + "sampling/importance_sampling_ratio/min": 0.3722730576992035, + "sampling/sampling_logp_difference/max": 0.9881277084350586, + "sampling/sampling_logp_difference/mean": 0.016205525025725365, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 190.546875, + "completions/mean_terminated_length": 190.546875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.28110504150390625, + "epoch": 0.3014705882352941, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.9143400473740906, + "kl": 0.01969430036842823, + "learning_rate": 9.959349593495935e-07, + "loss": 0.0256, + "num_tokens": 7544686.0, + "reward": -0.03125, + "reward_std": 0.6683381795883179, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.797054648399353, + "sampling/importance_sampling_ratio/mean": 0.9998266696929932, + "sampling/importance_sampling_ratio/min": 0.3880440294742584, + "sampling/sampling_logp_difference/max": 0.946636438369751, + "sampling/sampling_logp_difference/mean": 0.01925182156264782, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 193.75, + "completions/mean_terminated_length": 193.75, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.10917695611715317, + "epoch": 0.30269607843137253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04571589721542751, + "kl": 0.013499004766345024, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 7572382.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000195503234863, + "sampling/importance_sampling_ratio/min": 0.20237791538238525, + "sampling/sampling_logp_difference/max": 1.5976184606552124, + "sampling/sampling_logp_difference/mean": 0.008364669978618622, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 163.328125, + "completions/mean_terminated_length": 163.328125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.22276531159877777, + "epoch": 0.30392156862745096, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2225595181548026, + "kl": 0.020731121301651, + "learning_rate": 9.99999492515838e-07, + "loss": -0.0246, + "num_tokens": 7600355.0, + "reward": -0.34375, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992364048957825, + "sampling/importance_sampling_ratio/min": 0.5002468228340149, + "sampling/sampling_logp_difference/max": 0.7977039813995361, + "sampling/sampling_logp_difference/mean": 0.018636515364050865, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 228.640625, + "completions/mean_terminated_length": 228.640625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2560403347015381, + "epoch": 0.30514705882352944, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.388117291352842, + "kl": 0.01740189641714096, + "learning_rate": 9.99997970064382e-07, + "loss": -0.0331, + "num_tokens": 7635852.0, + "reward": 0.46875, + "reward_std": 0.6223389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9314196109771729, + "sampling/importance_sampling_ratio/mean": 1.00026273727417, + "sampling/importance_sampling_ratio/min": 0.48761385679244995, + "sampling/sampling_logp_difference/max": 0.7182314395904541, + "sampling/sampling_logp_difference/mean": 0.017072558403015137, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 178.921875, + "completions/mean_terminated_length": 178.921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2835567593574524, + "epoch": 0.30637254901960786, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.0005930201165896, + "kl": 0.02886389009654522, + "learning_rate": 9.999954326487227e-07, + "loss": 0.0229, + "num_tokens": 7662375.0, + "reward": 0.4375, + "reward_std": 0.5501632690429688, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8641325235366821, + "sampling/importance_sampling_ratio/mean": 0.9994344711303711, + "sampling/importance_sampling_ratio/min": 0.5916401147842407, + "sampling/sampling_logp_difference/max": 0.622795820236206, + "sampling/sampling_logp_difference/mean": 0.017332665622234344, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 177.078125, + "completions/mean_terminated_length": 177.078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.19621199369430542, + "epoch": 0.3075980392156863, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.495969881060601, + "kl": 0.021376606076955795, + "learning_rate": 9.999918802740106e-07, + "loss": 0.0438, + "num_tokens": 7686092.0, + "reward": 0.875, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4804352521896362, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.5440237522125244, + "sampling/sampling_logp_difference/max": 0.6087623834609985, + "sampling/sampling_logp_difference/mean": 0.013395338319242, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 198.3125, + "completions/mean_terminated_length": 198.3125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.14947807788848877, + "epoch": 0.3088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4286947189398673, + "kl": 0.016668811440467834, + "learning_rate": 9.999873129474573e-07, + "loss": 0.0134, + "num_tokens": 7718192.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999808669090271, + "sampling/importance_sampling_ratio/min": 0.4353545904159546, + "sampling/sampling_logp_difference/max": 0.8583614826202393, + "sampling/sampling_logp_difference/mean": 0.012718813493847847, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 233.1875, + "completions/mean_terminated_length": 233.1875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2590774595737457, + "epoch": 0.31004901960784315, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.371577105170613, + "kl": 0.018302420154213905, + "learning_rate": 9.999817306783336e-07, + "loss": 0.0031, + "num_tokens": 7748204.0, + "reward": 0.5, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6542383432388306, + "sampling/importance_sampling_ratio/mean": 0.9993072748184204, + "sampling/importance_sampling_ratio/min": 0.4976939857006073, + "sampling/sampling_logp_difference/max": 0.6977698802947998, + "sampling/sampling_logp_difference/mean": 0.015687517821788788, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 153.859375, + "completions/mean_terminated_length": 153.859375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.14650636911392212, + "epoch": 0.3112745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03853809124553455, + "kl": 0.016221893951296806, + "learning_rate": 9.999751334779714e-07, + "loss": 0.0002, + "num_tokens": 7772627.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000417947769165, + "sampling/importance_sampling_ratio/min": 0.4323920011520386, + "sampling/sampling_logp_difference/max": 1.1542718410491943, + "sampling/sampling_logp_difference/mean": 0.010418561287224293, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 151.9375, + "completions/mean_terminated_length": 151.9375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.22642837464809418, + "epoch": 0.3125, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4917039896100186, + "kl": 0.03227543085813522, + "learning_rate": 9.999675213597626e-07, + "loss": 0.0128, + "num_tokens": 7801695.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4883091449737549, + "sampling/importance_sampling_ratio/mean": 1.0000696182250977, + "sampling/importance_sampling_ratio/min": 0.3592665493488312, + "sampling/sampling_logp_difference/max": 1.0236907005310059, + "sampling/sampling_logp_difference/mean": 0.016973216086626053, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 180.265625, + "completions/mean_terminated_length": 180.265625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.18876762688159943, + "epoch": 0.3137254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3133944861645632, + "kl": 0.02303905412554741, + "learning_rate": 9.999588943391595e-07, + "loss": -0.0206, + "num_tokens": 7831184.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992313385009766, + "sampling/importance_sampling_ratio/min": 0.4441002309322357, + "sampling/sampling_logp_difference/max": 0.9530837535858154, + "sampling/sampling_logp_difference/mean": 0.014563923701643944, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 202.59375, + "completions/mean_terminated_length": 202.59375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.22054433822631836, + "epoch": 0.31495098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3198213503704428, + "kl": 0.03424456715583801, + "learning_rate": 9.999492524336742e-07, + "loss": -0.0358, + "num_tokens": 7858454.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6623640060424805, + "sampling/importance_sampling_ratio/mean": 0.9998559355735779, + "sampling/importance_sampling_ratio/min": 0.5483723282814026, + "sampling/sampling_logp_difference/max": 0.6008007526397705, + "sampling/sampling_logp_difference/mean": 0.013884920626878738, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 187.5, + "completions/mean_terminated_length": 187.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.23330222070217133, + "epoch": 0.3161764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.469206670551493, + "kl": 0.04369250684976578, + "learning_rate": 9.999385956628792e-07, + "loss": 0.0103, + "num_tokens": 7885478.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007914304733276, + "sampling/importance_sampling_ratio/min": 0.4159523844718933, + "sampling/sampling_logp_difference/max": 0.8771844506263733, + "sampling/sampling_logp_difference/mean": 0.016755102202296257, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 192.078125, + "completions/mean_terminated_length": 192.078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.1585007905960083, + "epoch": 0.3174019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6558902324721223, + "kl": 0.045375652611255646, + "learning_rate": 9.999269240484069e-07, + "loss": -0.0616, + "num_tokens": 7915627.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9510289430618286, + "sampling/importance_sampling_ratio/mean": 1.0000808238983154, + "sampling/importance_sampling_ratio/min": 0.3604038953781128, + "sampling/sampling_logp_difference/max": 1.0205299854278564, + "sampling/sampling_logp_difference/mean": 0.0157431922852993, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 212.59375, + "completions/mean_terminated_length": 212.59375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.20303696393966675, + "epoch": 0.31862745098039214, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4575941251694466, + "kl": 0.029700467362999916, + "learning_rate": 9.999142376139503e-07, + "loss": 0.0139, + "num_tokens": 7949601.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002882480621338, + "sampling/importance_sampling_ratio/min": 0.4251917004585266, + "sampling/sampling_logp_difference/max": 0.8552151918411255, + "sampling/sampling_logp_difference/mean": 0.014961793087422848, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 142.90625, + "completions/mean_terminated_length": 142.90625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.15443193912506104, + "epoch": 0.31985294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07583212926367194, + "kl": 0.04153234511613846, + "learning_rate": 9.999005363852617e-07, + "loss": 0.0004, + "num_tokens": 7974603.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000119686126709, + "sampling/importance_sampling_ratio/min": 0.4441169500350952, + "sampling/sampling_logp_difference/max": 0.8772122859954834, + "sampling/sampling_logp_difference/mean": 0.012940686196088791, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 223.25, + "completions/mean_terminated_length": 223.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.1729099452495575, + "epoch": 0.32107843137254904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053998073517434415, + "kl": 0.02735218033194542, + "learning_rate": 9.99885820390154e-07, + "loss": 0.0003, + "num_tokens": 8008011.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006272792816162, + "sampling/importance_sampling_ratio/min": 0.16875389218330383, + "sampling/sampling_logp_difference/max": 1.7793139219284058, + "sampling/sampling_logp_difference/mean": 0.013979100622236729, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 240.171875, + "completions/mean_terminated_length": 240.171875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2443355768918991, + "epoch": 0.32230392156862747, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.43946002353813, + "kl": 0.026139382272958755, + "learning_rate": 9.998700896584995e-07, + "loss": 0.0006, + "num_tokens": 8043670.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8476462364196777, + "sampling/importance_sampling_ratio/mean": 1.000669002532959, + "sampling/importance_sampling_ratio/min": 0.4119199812412262, + "sampling/sampling_logp_difference/max": 0.8869261741638184, + "sampling/sampling_logp_difference/mean": 0.01667368784546852, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 290.015625, + "completions/mean_terminated_length": 290.015625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.23038527369499207, + "epoch": 0.3235294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.2449243314746616, + "kl": 0.054875560104846954, + "learning_rate": 9.998533442222308e-07, + "loss": 0.0311, + "num_tokens": 8081463.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998653531074524, + "sampling/importance_sampling_ratio/min": 0.0009191891294904053, + "sampling/sampling_logp_difference/max": 6.992018699645996, + "sampling/sampling_logp_difference/mean": 0.016055557876825333, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 256.875, + "completions/mean_terminated_length": 256.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.16996632516384125, + "epoch": 0.3247549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.183405230529886, + "kl": 0.036977771669626236, + "learning_rate": 9.9983558411534e-07, + "loss": 0.0049, + "num_tokens": 8113151.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6553928852081299, + "sampling/importance_sampling_ratio/mean": 0.9998458027839661, + "sampling/importance_sampling_ratio/min": 0.4956169128417969, + "sampling/sampling_logp_difference/max": 0.7019519805908203, + "sampling/sampling_logp_difference/mean": 0.011236455291509628, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 196.109375, + "completions/mean_terminated_length": 196.109375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.1882980465888977, + "epoch": 0.32598039215686275, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6896309395623348, + "kl": 0.054088227450847626, + "learning_rate": 9.99816809373879e-07, + "loss": 0.0229, + "num_tokens": 8145990.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001240968704224, + "sampling/importance_sampling_ratio/min": 0.24144580960273743, + "sampling/sampling_logp_difference/max": 1.4211102724075317, + "sampling/sampling_logp_difference/mean": 0.01539707649499178, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 211.15625, + "completions/mean_terminated_length": 211.15625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.24172553420066833, + "epoch": 0.3272058823529412, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.4090101474163363, + "kl": 0.0678095892071724, + "learning_rate": 9.99797020035959e-07, + "loss": -0.0382, + "num_tokens": 8178928.0, + "reward": 0.15625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001152515411377, + "sampling/importance_sampling_ratio/min": 0.4870632290840149, + "sampling/sampling_logp_difference/max": 0.7327721118927002, + "sampling/sampling_logp_difference/mean": 0.017825480550527573, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 264.78125, + "completions/mean_terminated_length": 264.78125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.16782930493354797, + "epoch": 0.3284313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2461276023549521, + "kl": 0.021782714873552322, + "learning_rate": 9.997762161417517e-07, + "loss": -0.0767, + "num_tokens": 8213570.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000859260559082, + "sampling/importance_sampling_ratio/min": 0.3568447530269623, + "sampling/sampling_logp_difference/max": 1.0304545164108276, + "sampling/sampling_logp_difference/mean": 0.012895837426185608, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 275.84375, + "completions/mean_terminated_length": 275.84375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.187491774559021, + "epoch": 0.32965686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06064687623075509, + "kl": 0.035978008061647415, + "learning_rate": 9.997543977334873e-07, + "loss": 0.0004, + "num_tokens": 8256648.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998468160629272, + "sampling/importance_sampling_ratio/min": 0.37678810954093933, + "sampling/sampling_logp_difference/max": 0.9760723114013672, + "sampling/sampling_logp_difference/mean": 0.01476958580315113, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 216.703125, + "completions/mean_terminated_length": 216.703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.23430466651916504, + "epoch": 0.33088235294117646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06133336236799135, + "kl": 0.02953743189573288, + "learning_rate": 9.99731564855456e-07, + "loss": 0.0003, + "num_tokens": 8287189.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994462728500366, + "sampling/importance_sampling_ratio/min": 0.3390989899635315, + "sampling/sampling_logp_difference/max": 1.0814632177352905, + "sampling/sampling_logp_difference/mean": 0.0169600211083889, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 307.265625, + "completions/mean_terminated_length": 307.265625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.14084765315055847, + "epoch": 0.3321078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048327314027760665, + "kl": 0.02697977051138878, + "learning_rate": 9.997077175540066e-07, + "loss": 0.0002, + "num_tokens": 8326342.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.913650393486023, + "sampling/importance_sampling_ratio/mean": 1.000585675239563, + "sampling/importance_sampling_ratio/min": 0.47834527492523193, + "sampling/sampling_logp_difference/max": 0.7374224662780762, + "sampling/sampling_logp_difference/mean": 0.012069791555404663, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 183.015625, + "completions/mean_terminated_length": 183.015625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.18567748367786407, + "epoch": 0.3333333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0970298499771554, + "kl": 0.0578538179397583, + "learning_rate": 9.996828558775485e-07, + "loss": 0.0005, + "num_tokens": 8358983.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7512283325195312, + "sampling/importance_sampling_ratio/mean": 0.9999632835388184, + "sampling/importance_sampling_ratio/min": 0.4345874786376953, + "sampling/sampling_logp_difference/max": 0.8333580493927002, + "sampling/sampling_logp_difference/mean": 0.014503656886518002, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 224.859375, + "completions/mean_terminated_length": 224.859375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.20408973097801208, + "epoch": 0.33455882352941174, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1503994156290938, + "kl": 0.0713062733411789, + "learning_rate": 9.996569798765487e-07, + "loss": 0.0104, + "num_tokens": 8388094.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.8720554113388062, + "sampling/importance_sampling_ratio/mean": 1.0004630088806152, + "sampling/importance_sampling_ratio/min": 0.38671213388442993, + "sampling/sampling_logp_difference/max": 0.9500746726989746, + "sampling/sampling_logp_difference/mean": 0.014154992997646332, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 217.65625, + "completions/mean_terminated_length": 217.65625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.19104424118995667, + "epoch": 0.33578431372549017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059019722270121706, + "kl": 0.027294889092445374, + "learning_rate": 9.996300896035338e-07, + "loss": 0.0003, + "num_tokens": 8416872.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.745153784751892, + "sampling/importance_sampling_ratio/mean": 0.9995296001434326, + "sampling/importance_sampling_ratio/min": 0.3825327455997467, + "sampling/sampling_logp_difference/max": 0.9609410762786865, + "sampling/sampling_logp_difference/mean": 0.013582659885287285, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 261.53125, + "completions/mean_terminated_length": 261.53125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.15209710597991943, + "epoch": 0.33700980392156865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05488378670977295, + "kl": 0.024721980094909668, + "learning_rate": 9.996021851130896e-07, + "loss": 0.0003, + "num_tokens": 8451658.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9497171640396118, + "sampling/importance_sampling_ratio/mean": 1.0000152587890625, + "sampling/importance_sampling_ratio/min": 0.23663154244422913, + "sampling/sampling_logp_difference/max": 1.4412510395050049, + "sampling/sampling_logp_difference/mean": 0.012828035280108452, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 211.59375, + "completions/mean_terminated_length": 211.59375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.1754235327243805, + "epoch": 0.3382352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06723153773365091, + "kl": 0.03830999881029129, + "learning_rate": 9.995732664618603e-07, + "loss": 0.0004, + "num_tokens": 8497664.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.3859061598777771, + "sampling/sampling_logp_difference/max": 0.9521610736846924, + "sampling/sampling_logp_difference/mean": 0.015094295144081116, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 288.453125, + "completions/mean_terminated_length": 288.453125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.16961298882961273, + "epoch": 0.3394607843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5767552367347628, + "kl": 0.039779193699359894, + "learning_rate": 9.99543333708549e-07, + "loss": 0.0023, + "num_tokens": 8532941.0, + "reward": 0.03125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998610019683838, + "sampling/importance_sampling_ratio/min": 0.17132364213466644, + "sampling/sampling_logp_difference/max": 1.7642008066177368, + "sampling/sampling_logp_difference/mean": 0.012929710559546947, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 206.265625, + "completions/mean_terminated_length": 206.265625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.1733682006597519, + "epoch": 0.34068627450980393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05794989946174121, + "kl": 0.0274711512029171, + "learning_rate": 9.995123869139176e-07, + "loss": 0.0002, + "num_tokens": 8559566.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992111921310425, + "sampling/importance_sampling_ratio/min": 0.10852707177400589, + "sampling/sampling_logp_difference/max": 2.2207555770874023, + "sampling/sampling_logp_difference/mean": 0.014123711735010147, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 234.46875, + "completions/mean_terminated_length": 234.46875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.23664844036102295, + "epoch": 0.34191176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0550953417439475, + "kl": 0.03200116753578186, + "learning_rate": 9.994804261407854e-07, + "loss": 0.0003, + "num_tokens": 8600412.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7216484546661377, + "sampling/importance_sampling_ratio/mean": 1.0006502866744995, + "sampling/importance_sampling_ratio/min": 0.3945484459400177, + "sampling/sampling_logp_difference/max": 0.9300134181976318, + "sampling/sampling_logp_difference/mean": 0.017763927578926086, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 138.734375, + "completions/mean_terminated_length": 138.734375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.18176689743995667, + "epoch": 0.3431372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5866845245694385, + "kl": 0.0721161812543869, + "learning_rate": 9.994474514540312e-07, + "loss": 0.0195, + "num_tokens": 8632811.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579191207886, + "sampling/importance_sampling_ratio/min": 0.5019660592079163, + "sampling/sampling_logp_difference/max": 0.7264273166656494, + "sampling/sampling_logp_difference/mean": 0.01411209162324667, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 331.859375, + "completions/mean_terminated_length": 331.859375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.18779036402702332, + "epoch": 0.3443627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8192113355382399, + "kl": 0.04017651081085205, + "learning_rate": 9.994134629205917e-07, + "loss": -0.0012, + "num_tokens": 8671138.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7152165174484253, + "sampling/importance_sampling_ratio/mean": 1.0000743865966797, + "sampling/importance_sampling_ratio/min": 0.3180188238620758, + "sampling/sampling_logp_difference/max": 1.1456446647644043, + "sampling/sampling_logp_difference/mean": 0.012649483978748322, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1427.0, + "completions/max_terminated_length": 1427.0, + "completions/mean_length": 379.546875, + "completions/mean_terminated_length": 379.546875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.14155760407447815, + "epoch": 0.34558823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07669496214440431, + "kl": 0.03512256219983101, + "learning_rate": 9.99378460609461e-07, + "loss": 0.0002, + "num_tokens": 8709973.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.752143383026123, + "sampling/importance_sampling_ratio/mean": 0.9998910427093506, + "sampling/importance_sampling_ratio/min": 0.48715895414352417, + "sampling/sampling_logp_difference/max": 0.7191648483276367, + "sampling/sampling_logp_difference/mean": 0.011532300151884556, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 332.921875, + "completions/mean_terminated_length": 332.921875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.26236775517463684, + "epoch": 0.34681372549019607, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4283450163176932, + "kl": 0.034624628722667694, + "learning_rate": 9.993424445916922e-07, + "loss": 0.008, + "num_tokens": 8748688.0, + "reward": -0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9392987489700317, + "sampling/importance_sampling_ratio/mean": 0.9999065399169922, + "sampling/importance_sampling_ratio/min": 0.43108800053596497, + "sampling/sampling_logp_difference/max": 0.8414430618286133, + "sampling/sampling_logp_difference/mean": 0.01542898640036583, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 183.96875, + "completions/mean_terminated_length": 183.96875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2028695046901703, + "epoch": 0.3480392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07872260060535358, + "kl": 0.029936885461211205, + "learning_rate": 9.993054149403949e-07, + "loss": 0.0003, + "num_tokens": 8777182.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003210306167603, + "sampling/importance_sampling_ratio/min": 0.6080567240715027, + "sampling/sampling_logp_difference/max": 0.7567486763000488, + "sampling/sampling_logp_difference/mean": 0.014912188053131104, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 179.4375, + "completions/mean_terminated_length": 179.4375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2501886785030365, + "epoch": 0.3492647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.352352338282128, + "kl": 0.04442233592271805, + "learning_rate": 9.992673717307372e-07, + "loss": -0.0322, + "num_tokens": 8805418.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996594786643982, + "sampling/importance_sampling_ratio/min": 0.4986007809638977, + "sampling/sampling_logp_difference/max": 1.569200038909912, + "sampling/sampling_logp_difference/mean": 0.016398219391703606, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 317.25, + "completions/mean_terminated_length": 317.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.25562727451324463, + "epoch": 0.35049019607843135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03983127866431625, + "kl": 0.01921760104596615, + "learning_rate": 9.992283150399446e-07, + "loss": 0.0002, + "num_tokens": 8845498.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999561071395874, + "sampling/importance_sampling_ratio/min": 0.41906750202178955, + "sampling/sampling_logp_difference/max": 0.8697233200073242, + "sampling/sampling_logp_difference/mean": 0.014112534001469612, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 181.78125, + "completions/mean_terminated_length": 181.78125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.1340177357196808, + "epoch": 0.35171568627450983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09420534167407238, + "kl": 0.04165516048669815, + "learning_rate": 9.991882449472994e-07, + "loss": 0.0004, + "num_tokens": 8871020.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9570924043655396, + "sampling/importance_sampling_ratio/mean": 0.9995983839035034, + "sampling/importance_sampling_ratio/min": 0.3091437816619873, + "sampling/sampling_logp_difference/max": 1.1739487648010254, + "sampling/sampling_logp_difference/mean": 0.011645602062344551, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 302.84375, + "completions/mean_terminated_length": 302.84375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.16266974806785583, + "epoch": 0.35294117647058826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07678208797274977, + "kl": 0.03029218316078186, + "learning_rate": 9.991471615341415e-07, + "loss": 0.0003, + "num_tokens": 8908882.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000216960906982, + "sampling/importance_sampling_ratio/min": 0.5609308481216431, + "sampling/sampling_logp_difference/max": 0.7711536884307861, + "sampling/sampling_logp_difference/mean": 0.010535717010498047, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 297.375, + "completions/mean_terminated_length": 297.375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.16066837310791016, + "epoch": 0.3541666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0617927358175623, + "kl": 0.02349277213215828, + "learning_rate": 9.991050648838675e-07, + "loss": 0.0104, + "num_tokens": 8953658.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8624604940414429, + "sampling/importance_sampling_ratio/mean": 1.0008823871612549, + "sampling/importance_sampling_ratio/min": 0.2730712294578552, + "sampling/sampling_logp_difference/max": 1.2980226278305054, + "sampling/sampling_logp_difference/mean": 0.012821042910218239, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 242.0, + "completions/mean_terminated_length": 242.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.22517183423042297, + "epoch": 0.3553921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8833675798052343, + "kl": 0.03744828328490257, + "learning_rate": 9.990619550819312e-07, + "loss": 0.0569, + "num_tokens": 8989802.0, + "reward": 0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999470114707947, + "sampling/importance_sampling_ratio/min": 0.439327210187912, + "sampling/sampling_logp_difference/max": 0.8553783893585205, + "sampling/sampling_logp_difference/mean": 0.014835933223366737, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 264.34375, + "completions/mean_terminated_length": 264.34375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.17973962426185608, + "epoch": 0.35661764705882354, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1682813575991569, + "kl": 0.04859580099582672, + "learning_rate": 9.990178322158424e-07, + "loss": -0.005, + "num_tokens": 9022816.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8273805379867554, + "sampling/importance_sampling_ratio/mean": 0.9987258315086365, + "sampling/importance_sampling_ratio/min": 0.41369491815567017, + "sampling/sampling_logp_difference/max": 0.8826265335083008, + "sampling/sampling_logp_difference/mean": 0.014202505350112915, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 268.03125, + "completions/mean_terminated_length": 268.03125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.21925997734069824, + "epoch": 0.35784313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07576893404582719, + "kl": 0.03912314772605896, + "learning_rate": 9.989726963751682e-07, + "loss": 0.0003, + "num_tokens": 9061986.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995810985565186, + "sampling/importance_sampling_ratio/min": 0.45204266905784607, + "sampling/sampling_logp_difference/max": 0.9608368873596191, + "sampling/sampling_logp_difference/mean": 0.01378319039940834, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 278.234375, + "completions/mean_terminated_length": 278.234375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.17324486374855042, + "epoch": 0.3590686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8174386073848596, + "kl": 0.031908657401800156, + "learning_rate": 9.989265476515309e-07, + "loss": -0.0612, + "num_tokens": 9099761.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.9097315073013306, + "sampling/importance_sampling_ratio/mean": 1.000276803970337, + "sampling/importance_sampling_ratio/min": 0.2784660756587982, + "sampling/sampling_logp_difference/max": 1.278459072113037, + "sampling/sampling_logp_difference/mean": 0.01174275204539299, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1447.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 348.5, + "completions/mean_terminated_length": 348.5, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2010985016822815, + "epoch": 0.3602941176470588, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1588447696198672, + "kl": 0.02160612866282463, + "learning_rate": 9.9887938613861e-07, + "loss": 0.0023, + "num_tokens": 9143521.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.87212336063385, + "sampling/importance_sampling_ratio/mean": 1.0001239776611328, + "sampling/importance_sampling_ratio/min": 0.44800442457199097, + "sampling/sampling_logp_difference/max": 0.802952229976654, + "sampling/sampling_logp_difference/mean": 0.012264113873243332, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 212.765625, + "completions/mean_terminated_length": 212.765625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2683764100074768, + "epoch": 0.36151960784313725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07303974922055913, + "kl": 0.04338815063238144, + "learning_rate": 9.988312119321402e-07, + "loss": 0.0004, + "num_tokens": 9172114.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6276811361312866, + "sampling/importance_sampling_ratio/mean": 1.0005288124084473, + "sampling/importance_sampling_ratio/min": 0.5363470911979675, + "sampling/sampling_logp_difference/max": 0.6229737997055054, + "sampling/sampling_logp_difference/mean": 0.015231205150485039, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 258.796875, + "completions/mean_terminated_length": 258.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.26148664951324463, + "epoch": 0.3627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1008567391160247, + "kl": 0.03650952875614166, + "learning_rate": 9.98782025129912e-07, + "loss": 0.019, + "num_tokens": 9205589.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9441547393798828, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 0.4872017800807953, + "sampling/sampling_logp_difference/max": 0.7190768718719482, + "sampling/sampling_logp_difference/mean": 0.014916934072971344, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1169.0, + "completions/max_terminated_length": 1169.0, + "completions/mean_length": 223.484375, + "completions/mean_terminated_length": 223.484375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2084149718284607, + "epoch": 0.3639705882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.926986647314951, + "kl": 0.03650977462530136, + "learning_rate": 9.987318258317715e-07, + "loss": 0.0132, + "num_tokens": 9235124.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8333377838134766, + "sampling/importance_sampling_ratio/mean": 1.0002769231796265, + "sampling/importance_sampling_ratio/min": 0.5615825057029724, + "sampling/sampling_logp_difference/max": 0.6061382293701172, + "sampling/sampling_logp_difference/mean": 0.01239529624581337, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 185.703125, + "completions/mean_terminated_length": 185.703125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.21163314580917358, + "epoch": 0.36519607843137253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07274281159547615, + "kl": 0.03799588233232498, + "learning_rate": 9.986806141396205e-07, + "loss": 0.0004, + "num_tokens": 9264353.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.944161295890808, + "sampling/importance_sampling_ratio/mean": 1.000054121017456, + "sampling/importance_sampling_ratio/min": 0.4457568824291229, + "sampling/sampling_logp_difference/max": 0.8079816102981567, + "sampling/sampling_logp_difference/mean": 0.014065688475966454, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 218.78125, + "completions/mean_terminated_length": 218.78125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2359258532524109, + "epoch": 0.36642156862745096, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.229085798192281, + "kl": 0.05050275847315788, + "learning_rate": 9.986283901574149e-07, + "loss": -0.025, + "num_tokens": 9294499.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994552135467529, + "sampling/importance_sampling_ratio/min": 0.48804524540901184, + "sampling/sampling_logp_difference/max": 0.7173471450805664, + "sampling/sampling_logp_difference/mean": 0.015633976086974144, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 213.6875, + "completions/mean_terminated_length": 213.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.2758669853210449, + "epoch": 0.36764705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2045180532016053, + "kl": 0.04507840797305107, + "learning_rate": 9.985751539911664e-07, + "loss": 0.0005, + "num_tokens": 9326383.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6551718711853027, + "sampling/importance_sampling_ratio/mean": 1.0001018047332764, + "sampling/importance_sampling_ratio/min": 0.5446324348449707, + "sampling/sampling_logp_difference/max": 0.6076440811157227, + "sampling/sampling_logp_difference/mean": 0.014851532876491547, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 217.65625, + "completions/mean_terminated_length": 217.65625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2571743130683899, + "epoch": 0.36887254901960786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078752455404774, + "kl": 0.04069396108388901, + "learning_rate": 9.985209057489408e-07, + "loss": 0.0004, + "num_tokens": 9359049.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8130477666854858, + "sampling/importance_sampling_ratio/mean": 1.0003347396850586, + "sampling/importance_sampling_ratio/min": 0.4874882102012634, + "sampling/sampling_logp_difference/max": 0.7184891700744629, + "sampling/sampling_logp_difference/mean": 0.015300736762583256, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 194.171875, + "completions/mean_terminated_length": 194.171875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.21515023708343506, + "epoch": 0.3700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.757122950918929, + "kl": 0.054561879485845566, + "learning_rate": 9.98465645540859e-07, + "loss": 0.0068, + "num_tokens": 9387956.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000688910484314, + "sampling/importance_sampling_ratio/min": 0.38900381326675415, + "sampling/sampling_logp_difference/max": 0.9441661834716797, + "sampling/sampling_logp_difference/mean": 0.01539285946637392, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 244.0625, + "completions/mean_terminated_length": 244.0625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2658084034919739, + "epoch": 0.3713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046382865558398226, + "kl": 0.043975912034511566, + "learning_rate": 9.984093734790954e-07, + "loss": 0.0004, + "num_tokens": 9423800.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8894855976104736, + "sampling/importance_sampling_ratio/mean": 0.999910831451416, + "sampling/importance_sampling_ratio/min": 0.6174265742301941, + "sampling/sampling_logp_difference/max": 0.6363046169281006, + "sampling/sampling_logp_difference/mean": 0.015923671424388885, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 206.28125, + "completions/mean_terminated_length": 206.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.24521231651306152, + "epoch": 0.37254901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06401306952952608, + "kl": 0.05235999822616577, + "learning_rate": 9.983520896778788e-07, + "loss": 0.0005, + "num_tokens": 9461882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997357726097107, + "sampling/importance_sampling_ratio/min": 0.380566269159317, + "sampling/sampling_logp_difference/max": 0.966094970703125, + "sampling/sampling_logp_difference/mean": 0.016459612175822258, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 162.828125, + "completions/mean_terminated_length": 162.828125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2646932899951935, + "epoch": 0.3737745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08126708221862732, + "kl": 0.09018871188163757, + "learning_rate": 9.982937942534917e-07, + "loss": 0.0009, + "num_tokens": 9488943.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002355575561523, + "sampling/importance_sampling_ratio/min": 0.2654292583465576, + "sampling/sampling_logp_difference/max": 1.3264069557189941, + "sampling/sampling_logp_difference/mean": 0.016122879460453987, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 179.53125, + "completions/mean_terminated_length": 179.53125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.24329300224781036, + "epoch": 0.375, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4688878428154875, + "kl": 0.07533843070268631, + "learning_rate": 9.982344873242701e-07, + "loss": 0.0022, + "num_tokens": 9516065.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5966145992279053, + "sampling/importance_sampling_ratio/mean": 1.000145673751831, + "sampling/importance_sampling_ratio/min": 0.6077024936676025, + "sampling/sampling_logp_difference/max": 0.49806976318359375, + "sampling/sampling_logp_difference/mean": 0.014692382887005806, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 172.265625, + "completions/mean_terminated_length": 172.265625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.1821404993534088, + "epoch": 0.3762254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054276720759989065, + "kl": 0.047477252781391144, + "learning_rate": 9.981741690106034e-07, + "loss": 0.0005, + "num_tokens": 9546386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002491474151611, + "sampling/importance_sampling_ratio/min": 0.6056228876113892, + "sampling/sampling_logp_difference/max": 0.7309679985046387, + "sampling/sampling_logp_difference/mean": 0.012202339246869087, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 209.234375, + "completions/mean_terminated_length": 209.234375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.18036088347434998, + "epoch": 0.37745098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037200787937837765, + "kl": 0.0318170040845871, + "learning_rate": 9.981128394349337e-07, + "loss": 0.0003, + "num_tokens": 9576913.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8171582221984863, + "sampling/importance_sampling_ratio/mean": 1.000547170639038, + "sampling/importance_sampling_ratio/min": 0.5328795313835144, + "sampling/sampling_logp_difference/max": 0.6294598579406738, + "sampling/sampling_logp_difference/mean": 0.012505259364843369, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 199.28125, + "completions/mean_terminated_length": 199.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2590799629688263, + "epoch": 0.3786764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.412172841260048, + "kl": 0.05636392906308174, + "learning_rate": 9.980504987217566e-07, + "loss": -0.0429, + "num_tokens": 9604531.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002868175506592, + "sampling/importance_sampling_ratio/min": 0.6122801899909973, + "sampling/sampling_logp_difference/max": 1.017359733581543, + "sampling/sampling_logp_difference/mean": 0.01534392312169075, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 252.109375, + "completions/mean_terminated_length": 252.109375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2604590058326721, + "epoch": 0.3799019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14569476673964082, + "kl": 0.06339927017688751, + "learning_rate": 9.979871469976195e-07, + "loss": 0.0006, + "num_tokens": 9642266.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999227523803711, + "sampling/importance_sampling_ratio/min": 0.5187325477600098, + "sampling/sampling_logp_difference/max": 0.7288380861282349, + "sampling/sampling_logp_difference/mean": 0.015960384160280228, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 159.921875, + "completions/mean_terminated_length": 159.921875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.23250165581703186, + "epoch": 0.38112745098039214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0633905125342962, + "kl": 0.061814285814762115, + "learning_rate": 9.979227843911224e-07, + "loss": 0.0006, + "num_tokens": 9674741.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4908007383346558, + "sampling/importance_sampling_ratio/mean": 0.9994626045227051, + "sampling/importance_sampling_ratio/min": 0.5913910269737244, + "sampling/sampling_logp_difference/max": 0.5252777934074402, + "sampling/sampling_logp_difference/mean": 0.014160791411995888, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 224.90625, + "completions/mean_terminated_length": 224.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2519511580467224, + "epoch": 0.38235294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4181004390709577, + "kl": 0.04638572037220001, + "learning_rate": 9.978574110329172e-07, + "loss": -0.0059, + "num_tokens": 9712223.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.9753763675689697, + "sampling/importance_sampling_ratio/mean": 1.0001909732818604, + "sampling/importance_sampling_ratio/min": 0.4974619448184967, + "sampling/sampling_logp_difference/max": 0.6982362270355225, + "sampling/sampling_logp_difference/mean": 0.014956100843846798, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 181.15625, + "completions/mean_terminated_length": 181.15625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.24080537259578705, + "epoch": 0.38357843137254904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06269812108988729, + "kl": 0.05156532675027847, + "learning_rate": 9.977910270557078e-07, + "loss": 0.0005, + "num_tokens": 9745305.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7569202184677124, + "sampling/importance_sampling_ratio/mean": 0.9995178580284119, + "sampling/importance_sampling_ratio/min": 0.42265576124191284, + "sampling/sampling_logp_difference/max": 0.8611972332000732, + "sampling/sampling_logp_difference/mean": 0.016552705317735672, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 213.671875, + "completions/mean_terminated_length": 213.671875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24672749638557434, + "epoch": 0.38480392156862747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038710724119157124, + "kl": 0.03980868682265282, + "learning_rate": 9.977236325942497e-07, + "loss": 0.0004, + "num_tokens": 9779444.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7142006158828735, + "sampling/importance_sampling_ratio/mean": 0.9995484948158264, + "sampling/importance_sampling_ratio/min": 0.5472590327262878, + "sampling/sampling_logp_difference/max": 0.6028330326080322, + "sampling/sampling_logp_difference/mean": 0.014922859147191048, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 203.515625, + "completions/mean_terminated_length": 203.515625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3034396767616272, + "epoch": 0.3860294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4020050860954343, + "kl": 0.0434013307094574, + "learning_rate": 9.97655227785349e-07, + "loss": -0.003, + "num_tokens": 9809765.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5903964042663574, + "sampling/importance_sampling_ratio/mean": 1.0003769397735596, + "sampling/importance_sampling_ratio/min": 0.4204670190811157, + "sampling/sampling_logp_difference/max": 0.866389274597168, + "sampling/sampling_logp_difference/mean": 0.016428284347057343, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 223.921875, + "completions/mean_terminated_length": 223.921875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.22055089473724365, + "epoch": 0.3872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06897500649119237, + "kl": 0.04714478179812431, + "learning_rate": 9.975858127678633e-07, + "loss": 0.0004, + "num_tokens": 9843376.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.663209319114685, + "sampling/importance_sampling_ratio/mean": 1.000046730041504, + "sampling/importance_sampling_ratio/min": 0.6187101006507874, + "sampling/sampling_logp_difference/max": 0.5087490081787109, + "sampling/sampling_logp_difference/mean": 0.01338781975209713, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 186.484375, + "completions/mean_terminated_length": 186.484375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.28117749094963074, + "epoch": 0.38848039215686275, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1723249204037125, + "kl": 0.06439540535211563, + "learning_rate": 9.975153876827007e-07, + "loss": 0.0153, + "num_tokens": 9873151.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6037282943725586, + "sampling/importance_sampling_ratio/mean": 0.9997469186782837, + "sampling/importance_sampling_ratio/min": 0.262337863445282, + "sampling/sampling_logp_difference/max": 1.338122010231018, + "sampling/sampling_logp_difference/mean": 0.01560327596962452, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 174.953125, + "completions/mean_terminated_length": 174.953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.1813991814851761, + "epoch": 0.3897058823529412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09730586734867665, + "kl": 0.049150414764881134, + "learning_rate": 9.974439526728196e-07, + "loss": 0.0005, + "num_tokens": 9902892.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8558018207550049, + "sampling/importance_sampling_ratio/mean": 0.999718427658081, + "sampling/importance_sampling_ratio/min": 0.5134551525115967, + "sampling/sampling_logp_difference/max": 0.6665925979614258, + "sampling/sampling_logp_difference/mean": 0.012492336332798004, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 184.515625, + "completions/mean_terminated_length": 184.515625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.25537946820259094, + "epoch": 0.3909313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048250038071933725, + "kl": 0.04358714073896408, + "learning_rate": 9.973715078832286e-07, + "loss": 0.0004, + "num_tokens": 9930685.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6917471885681152, + "sampling/importance_sampling_ratio/mean": 1.0004292726516724, + "sampling/importance_sampling_ratio/min": 0.4916207194328308, + "sampling/sampling_logp_difference/max": 0.710047721862793, + "sampling/sampling_logp_difference/mean": 0.015268863178789616, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 143.734375, + "completions/mean_terminated_length": 143.734375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.23418602347373962, + "epoch": 0.39215686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08301223920073554, + "kl": 0.06582756340503693, + "learning_rate": 9.97298053460986e-07, + "loss": 0.0006, + "num_tokens": 9958860.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7450684309005737, + "sampling/importance_sampling_ratio/mean": 0.999579668045044, + "sampling/importance_sampling_ratio/min": 0.17664504051208496, + "sampling/sampling_logp_difference/max": 1.7336130142211914, + "sampling/sampling_logp_difference/mean": 0.014755094423890114, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 161.875, + "completions/mean_terminated_length": 161.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.213094100356102, + "epoch": 0.39338235294117646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07286969598929685, + "kl": 0.06171388551592827, + "learning_rate": 9.972235895552e-07, + "loss": 0.0006, + "num_tokens": 9983716.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.597754716873169, + "sampling/importance_sampling_ratio/mean": 1.000230312347412, + "sampling/importance_sampling_ratio/min": 0.4993267059326172, + "sampling/sampling_logp_difference/max": 0.6944947242736816, + "sampling/sampling_logp_difference/mean": 0.013996414840221405, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 182.765625, + "completions/mean_terminated_length": 182.765625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.28134018182754517, + "epoch": 0.3946078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3561961079152003, + "kl": 0.07659714668989182, + "learning_rate": 9.971481163170269e-07, + "loss": 0.0083, + "num_tokens": 10016693.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996200203895569, + "sampling/importance_sampling_ratio/min": 0.15386080741882324, + "sampling/sampling_logp_difference/max": 1.8717069625854492, + "sampling/sampling_logp_difference/mean": 0.017205171287059784, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 240.359375, + "completions/mean_terminated_length": 240.359375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.19475317001342773, + "epoch": 0.3958333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03885232754500029, + "kl": 0.039011888206005096, + "learning_rate": 9.97071633899673e-07, + "loss": 0.0004, + "num_tokens": 10047308.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9393295049667358, + "sampling/importance_sampling_ratio/mean": 0.9997988343238831, + "sampling/importance_sampling_ratio/min": 0.4440854787826538, + "sampling/sampling_logp_difference/max": 0.8117382526397705, + "sampling/sampling_logp_difference/mean": 0.011159185320138931, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 177.46875, + "completions/mean_terminated_length": 177.46875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.26126256585121155, + "epoch": 0.39705882352941174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056674788480339515, + "kl": 0.05115184187889099, + "learning_rate": 9.969941424583925e-07, + "loss": 0.0005, + "num_tokens": 10079914.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8065956830978394, + "sampling/importance_sampling_ratio/mean": 0.9991174936294556, + "sampling/importance_sampling_ratio/min": 0.3691619634628296, + "sampling/sampling_logp_difference/max": 0.9965198040008545, + "sampling/sampling_logp_difference/mean": 0.017451079562306404, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 162.140625, + "completions/mean_terminated_length": 162.140625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.28593000769615173, + "epoch": 0.39828431372549017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16347853531304926, + "kl": 0.06763357669115067, + "learning_rate": 9.969156421504887e-07, + "loss": 0.0006, + "num_tokens": 10109251.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9765565395355225, + "sampling/importance_sampling_ratio/mean": 1.0008492469787598, + "sampling/importance_sampling_ratio/min": 0.22377948462963104, + "sampling/sampling_logp_difference/max": 1.4970941543579102, + "sampling/sampling_logp_difference/mean": 0.01835448667407036, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 142.890625, + "completions/mean_terminated_length": 142.890625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.237276092171669, + "epoch": 0.39950980392156865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060328620195097, + "kl": 0.06441694498062134, + "learning_rate": 9.968361331353116e-07, + "loss": 0.0006, + "num_tokens": 10132060.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6106983423233032, + "sampling/importance_sampling_ratio/mean": 1.000266671180725, + "sampling/importance_sampling_ratio/min": 0.22330020368099213, + "sampling/sampling_logp_difference/max": 1.4992382526397705, + "sampling/sampling_logp_difference/mean": 0.014961745589971542, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 179.078125, + "completions/mean_terminated_length": 179.078125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.29044216871261597, + "epoch": 0.4007352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061437390410855684, + "kl": 0.06264321506023407, + "learning_rate": 9.9675561557426e-07, + "loss": 0.0006, + "num_tokens": 10164001.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.732211947441101, + "sampling/importance_sampling_ratio/mean": 0.9993464946746826, + "sampling/importance_sampling_ratio/min": 0.47762712836265564, + "sampling/sampling_logp_difference/max": 0.7389249801635742, + "sampling/sampling_logp_difference/mean": 0.017121639102697372, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 197.640625, + "completions/mean_terminated_length": 197.640625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.28534969687461853, + "epoch": 0.4019607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.551587661231534, + "kl": 0.05487293004989624, + "learning_rate": 9.966740896307791e-07, + "loss": 0.0133, + "num_tokens": 10196522.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6065754890441895, + "sampling/importance_sampling_ratio/mean": 1.0002343654632568, + "sampling/importance_sampling_ratio/min": 0.5187581181526184, + "sampling/sampling_logp_difference/max": 0.6563175320625305, + "sampling/sampling_logp_difference/mean": 0.01480232086032629, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 187.9375, + "completions/mean_terminated_length": 187.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.23456662893295288, + "epoch": 0.40318627450980393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0676532720442417, + "kl": 0.05448365956544876, + "learning_rate": 9.965915554703613e-07, + "loss": 0.0005, + "num_tokens": 10222662.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6160944700241089, + "sampling/importance_sampling_ratio/mean": 0.9998668432235718, + "sampling/importance_sampling_ratio/min": 0.6147313714027405, + "sampling/sampling_logp_difference/max": 0.486569881439209, + "sampling/sampling_logp_difference/mean": 0.012745749205350876, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 185.9375, + "completions/mean_terminated_length": 185.9375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.28205636143684387, + "epoch": 0.40441176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04557906960290233, + "kl": 0.04451793432235718, + "learning_rate": 9.965080132605461e-07, + "loss": 0.0004, + "num_tokens": 10252466.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7039932012557983, + "sampling/importance_sampling_ratio/mean": 0.9997226595878601, + "sampling/importance_sampling_ratio/min": 0.6168078780174255, + "sampling/sampling_logp_difference/max": 0.5329744815826416, + "sampling/sampling_logp_difference/mean": 0.014937239699065685, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.24828140437602997, + "epoch": 0.4056372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06048318618681636, + "kl": 0.043864063918590546, + "learning_rate": 9.964234631709185e-07, + "loss": 0.0004, + "num_tokens": 10284970.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997553825378418, + "sampling/importance_sampling_ratio/min": 0.5263671278953552, + "sampling/sampling_logp_difference/max": 0.7147336006164551, + "sampling/sampling_logp_difference/mean": 0.015005389228463173, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 211.484375, + "completions/mean_terminated_length": 211.484375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.21704155206680298, + "epoch": 0.4068627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04417146396250178, + "kl": 0.046004392206668854, + "learning_rate": 9.963379053731102e-07, + "loss": 0.0004, + "num_tokens": 10314825.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999356269836426, + "sampling/importance_sampling_ratio/min": 0.5363428592681885, + "sampling/sampling_logp_difference/max": 0.7130594253540039, + "sampling/sampling_logp_difference/mean": 0.01378547865897417, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 167.921875, + "completions/mean_terminated_length": 167.921875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2341511845588684, + "epoch": 0.40808823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05261681014748957, + "kl": 0.04148735851049423, + "learning_rate": 9.96251340040798e-07, + "loss": 0.0004, + "num_tokens": 10342644.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.794343113899231, + "sampling/importance_sampling_ratio/mean": 1.0001440048217773, + "sampling/importance_sampling_ratio/min": 0.6222392916679382, + "sampling/sampling_logp_difference/max": 0.5846390724182129, + "sampling/sampling_logp_difference/mean": 0.014835352078080177, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 163.359375, + "completions/mean_terminated_length": 163.359375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.24428749084472656, + "epoch": 0.40931372549019607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05334254161930812, + "kl": 0.048776090145111084, + "learning_rate": 9.96163767349704e-07, + "loss": 0.0005, + "num_tokens": 10373755.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9219019412994385, + "sampling/importance_sampling_ratio/mean": 1.0004420280456543, + "sampling/importance_sampling_ratio/min": 0.4739871621131897, + "sampling/sampling_logp_difference/max": 0.7465750575065613, + "sampling/sampling_logp_difference/mean": 0.015613086521625519, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 235.828125, + "completions/mean_terminated_length": 235.828125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.31158655881881714, + "epoch": 0.4105392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08802832666554632, + "kl": 0.05691176652908325, + "learning_rate": 9.96075187477595e-07, + "loss": 0.0005, + "num_tokens": 10406544.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6408649682998657, + "sampling/importance_sampling_ratio/mean": 0.9993789792060852, + "sampling/importance_sampling_ratio/min": 0.4982425272464752, + "sampling/sampling_logp_difference/max": 0.6966683864593506, + "sampling/sampling_logp_difference/mean": 0.017097072675824165, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 137.703125, + "completions/mean_terminated_length": 137.703125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.19279997050762177, + "epoch": 0.4117647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494635031712655, + "kl": 0.05286743491888046, + "learning_rate": 9.959856006042828e-07, + "loss": 0.0005, + "num_tokens": 10433357.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7472764253616333, + "sampling/importance_sampling_ratio/mean": 0.9991959929466248, + "sampling/importance_sampling_ratio/min": 0.5327165722846985, + "sampling/sampling_logp_difference/max": 0.6297657489776611, + "sampling/sampling_logp_difference/mean": 0.011947166174650192, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 136.484375, + "completions/mean_terminated_length": 136.484375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2865660488605499, + "epoch": 0.41299019607843135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15011147787428938, + "kl": 0.068015456199646, + "learning_rate": 9.95895006911623e-07, + "loss": 0.0007, + "num_tokens": 10463676.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996687769889832, + "sampling/importance_sampling_ratio/min": 0.13205653429031372, + "sampling/sampling_logp_difference/max": 2.0245251655578613, + "sampling/sampling_logp_difference/mean": 0.02037999778985977, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 220.828125, + "completions/mean_terminated_length": 220.828125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.22336722910404205, + "epoch": 0.41421568627450983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041152110769261244, + "kl": 0.029732730239629745, + "learning_rate": 9.95803406583515e-07, + "loss": 0.0003, + "num_tokens": 10491585.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6976975202560425, + "sampling/importance_sampling_ratio/mean": 0.9999824166297913, + "sampling/importance_sampling_ratio/min": 0.5235568284988403, + "sampling/sampling_logp_difference/max": 0.6471096277236938, + "sampling/sampling_logp_difference/mean": 0.013062494806945324, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 168.59375, + "completions/mean_terminated_length": 168.59375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.29412657022476196, + "epoch": 0.41544117647058826, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.3569155139365905, + "kl": 0.08087459206581116, + "learning_rate": 9.957107998059018e-07, + "loss": -0.0774, + "num_tokens": 10518007.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.637667179107666, + "sampling/importance_sampling_ratio/mean": 0.9994561672210693, + "sampling/importance_sampling_ratio/min": 0.6117925047874451, + "sampling/sampling_logp_difference/max": 0.4932727813720703, + "sampling/sampling_logp_difference/mean": 0.017368581146001816, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 178.140625, + "completions/mean_terminated_length": 178.140625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.28629130125045776, + "epoch": 0.4166666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6632228991561313, + "kl": 0.05552738159894943, + "learning_rate": 9.956171867667693e-07, + "loss": -0.0419, + "num_tokens": 10552560.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6182644367218018, + "sampling/importance_sampling_ratio/mean": 0.9997725486755371, + "sampling/importance_sampling_ratio/min": 0.5164028406143188, + "sampling/sampling_logp_difference/max": 0.6608681678771973, + "sampling/sampling_logp_difference/mean": 0.01721036247909069, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 142.046875, + "completions/mean_terminated_length": 142.046875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.253487765789032, + "epoch": 0.4178921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07404755502753235, + "kl": 0.04503706842660904, + "learning_rate": 9.955225676561459e-07, + "loss": 0.0005, + "num_tokens": 10574387.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6683697700500488, + "sampling/importance_sampling_ratio/mean": 0.9993175268173218, + "sampling/importance_sampling_ratio/min": 0.39866963028907776, + "sampling/sampling_logp_difference/max": 0.9196221828460693, + "sampling/sampling_logp_difference/mean": 0.017505541443824768, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 203.265625, + "completions/mean_terminated_length": 203.265625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.1682276874780655, + "epoch": 0.41911764705882354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040842604358258025, + "kl": 0.028833290562033653, + "learning_rate": 9.954269426661022e-07, + "loss": 0.0003, + "num_tokens": 10611556.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 1.0004328489303589, + "sampling/importance_sampling_ratio/min": 0.512783944606781, + "sampling/sampling_logp_difference/max": 0.6679006814956665, + "sampling/sampling_logp_difference/mean": 0.01228379923850298, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 156.59375, + "completions/mean_terminated_length": 156.59375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2435804307460785, + "epoch": 0.42034313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9371516107086189, + "kl": 0.04949047416448593, + "learning_rate": 9.953303119907513e-07, + "loss": -0.0202, + "num_tokens": 10639210.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994317293167114, + "sampling/importance_sampling_ratio/min": 0.264559268951416, + "sampling/sampling_logp_difference/max": 1.3296899795532227, + "sampling/sampling_logp_difference/mean": 0.01969146728515625, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 193.375, + "completions/mean_terminated_length": 193.375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2623113989830017, + "epoch": 0.4215686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.348681539886754, + "kl": 0.049027375876903534, + "learning_rate": 9.952326758262472e-07, + "loss": 0.0203, + "num_tokens": 10671058.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6695427894592285, + "sampling/importance_sampling_ratio/mean": 1.0002408027648926, + "sampling/importance_sampling_ratio/min": 0.5719989538192749, + "sampling/sampling_logp_difference/max": 0.5586180686950684, + "sampling/sampling_logp_difference/mean": 0.01638277992606163, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 182.625, + "completions/mean_terminated_length": 182.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.13473373651504517, + "epoch": 0.4227941176470588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031428930438624054, + "kl": 0.017639823257923126, + "learning_rate": 9.95134034370785e-07, + "loss": 0.0002, + "num_tokens": 10696410.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 1.0000739097595215, + "sampling/importance_sampling_ratio/min": 0.3698100745677948, + "sampling/sampling_logp_difference/max": 0.9947657585144043, + "sampling/sampling_logp_difference/mean": 0.01015438698232174, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 172.171875, + "completions/mean_terminated_length": 172.171875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.22618728876113892, + "epoch": 0.42401960784313725, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1376996765904877, + "kl": 0.041539475321769714, + "learning_rate": 9.950343878246009e-07, + "loss": -0.0062, + "num_tokens": 10733653.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9957953691482544, + "sampling/importance_sampling_ratio/mean": 1.0007014274597168, + "sampling/importance_sampling_ratio/min": 0.4055122137069702, + "sampling/sampling_logp_difference/max": 0.902604341506958, + "sampling/sampling_logp_difference/mean": 0.01611269637942314, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 247.140625, + "completions/mean_terminated_length": 247.140625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2703923285007477, + "epoch": 0.4252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05173105447890271, + "kl": 0.046294279396533966, + "learning_rate": 9.949337363899708e-07, + "loss": 0.0003, + "num_tokens": 10765342.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7525776624679565, + "sampling/importance_sampling_ratio/mean": 1.0000587701797485, + "sampling/importance_sampling_ratio/min": 0.4304106533527374, + "sampling/sampling_logp_difference/max": 0.8430154919624329, + "sampling/sampling_logp_difference/mean": 0.016863878816366196, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 178.78125, + "completions/mean_terminated_length": 178.78125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.26333087682724, + "epoch": 0.4264705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09253528431740904, + "kl": 0.05567295104265213, + "learning_rate": 9.948320802712107e-07, + "loss": 0.0005, + "num_tokens": 10792064.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5458966493606567, + "sampling/importance_sampling_ratio/mean": 1.000347375869751, + "sampling/importance_sampling_ratio/min": 0.5169248580932617, + "sampling/sampling_logp_difference/max": 0.6598577499389648, + "sampling/sampling_logp_difference/mean": 0.016856186091899872, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 265.40625, + "completions/mean_terminated_length": 265.40625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.19036869704723358, + "epoch": 0.42769607843137253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029572162983203348, + "kl": 0.01962047815322876, + "learning_rate": 9.947294196746762e-07, + "loss": 0.0002, + "num_tokens": 10827850.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005722045898438, + "sampling/importance_sampling_ratio/min": 0.474294513463974, + "sampling/sampling_logp_difference/max": 1.0838651657104492, + "sampling/sampling_logp_difference/mean": 0.01202384103089571, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 176.140625, + "completions/mean_terminated_length": 176.140625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.21644078195095062, + "epoch": 0.42892156862745096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22621544381853376, + "kl": 0.06590238213539124, + "learning_rate": 9.946257548087619e-07, + "loss": 0.0005, + "num_tokens": 10854979.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7846208810806274, + "sampling/importance_sampling_ratio/mean": 0.9999986290931702, + "sampling/importance_sampling_ratio/min": 0.390634685754776, + "sampling/sampling_logp_difference/max": 0.9399824738502502, + "sampling/sampling_logp_difference/mean": 0.01423841156065464, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 186.8125, + "completions/mean_terminated_length": 186.8125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.17167896032333374, + "epoch": 0.43014705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05660215114493649, + "kl": 0.024589000269770622, + "learning_rate": 9.945210858839008e-07, + "loss": 0.0002, + "num_tokens": 10883751.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6356914043426514, + "sampling/importance_sampling_ratio/mean": 1.0000401735305786, + "sampling/importance_sampling_ratio/min": 0.5330818891525269, + "sampling/sampling_logp_difference/max": 0.6290802955627441, + "sampling/sampling_logp_difference/mean": 0.012451952323317528, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 280.234375, + "completions/mean_terminated_length": 280.234375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.22605139017105103, + "epoch": 0.43137254901960786, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.170995796907789, + "kl": 0.030084867030382156, + "learning_rate": 9.944154131125642e-07, + "loss": 0.0181, + "num_tokens": 10919542.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995243549346924, + "sampling/importance_sampling_ratio/min": 0.11731626838445663, + "sampling/sampling_logp_difference/max": 2.1428818702697754, + "sampling/sampling_logp_difference/mean": 0.014059376902878284, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 242.21875, + "completions/mean_terminated_length": 242.21875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.23910082876682281, + "epoch": 0.4325980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5118189053638529, + "kl": 0.03150876611471176, + "learning_rate": 9.94308736709261e-07, + "loss": 0.067, + "num_tokens": 10955588.0, + "reward": -0.34375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001499652862549, + "sampling/importance_sampling_ratio/min": 0.5876941084861755, + "sampling/sampling_logp_difference/max": 0.9266014099121094, + "sampling/sampling_logp_difference/mean": 0.015095872804522514, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 210.8125, + "completions/mean_terminated_length": 210.8125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.22580724954605103, + "epoch": 0.4338235294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.348216584589421, + "kl": 0.02664954587817192, + "learning_rate": 9.94201056890538e-07, + "loss": -0.0162, + "num_tokens": 10986296.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996037483215332, + "sampling/importance_sampling_ratio/min": 0.4866310954093933, + "sampling/sampling_logp_difference/max": 0.7321920394897461, + "sampling/sampling_logp_difference/mean": 0.015312884002923965, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 283.90625, + "completions/mean_terminated_length": 283.90625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.28491437435150146, + "epoch": 0.43504901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2733266904092038, + "kl": 0.026450350880622864, + "learning_rate": 9.940923738749777e-07, + "loss": -0.0108, + "num_tokens": 11021714.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.79961359500885, + "sampling/importance_sampling_ratio/mean": 1.0002620220184326, + "sampling/importance_sampling_ratio/min": 0.4311363995075226, + "sampling/sampling_logp_difference/max": 0.8413307666778564, + "sampling/sampling_logp_difference/mean": 0.016569074243307114, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 222.71875, + "completions/mean_terminated_length": 222.71875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.25121283531188965, + "epoch": 0.4362745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04937892096437045, + "kl": 0.026176000013947487, + "learning_rate": 9.939826878832003e-07, + "loss": 0.0003, + "num_tokens": 11051440.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6256892681121826, + "sampling/importance_sampling_ratio/mean": 0.9997677803039551, + "sampling/importance_sampling_ratio/min": 0.3972400426864624, + "sampling/sampling_logp_difference/max": 0.9232145547866821, + "sampling/sampling_logp_difference/mean": 0.016339467838406563, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 240.640625, + "completions/mean_terminated_length": 240.640625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.32118406891822815, + "epoch": 0.4375, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1057094242743686, + "kl": 0.03562553972005844, + "learning_rate": 9.938719991378613e-07, + "loss": -0.0342, + "num_tokens": 11088825.0, + "reward": 0.28125, + "reward_std": 0.659286618232727, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5646177530288696, + "sampling/importance_sampling_ratio/mean": 0.9996243119239807, + "sampling/importance_sampling_ratio/min": 0.4814416170120239, + "sampling/sampling_logp_difference/max": 0.7309703826904297, + "sampling/sampling_logp_difference/mean": 0.01743035763502121, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 198.3125, + "completions/mean_terminated_length": 198.3125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.28648340702056885, + "epoch": 0.4387254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5807774770939829, + "kl": 0.04289580509066582, + "learning_rate": 9.937603078636518e-07, + "loss": 0.0005, + "num_tokens": 11127069.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991505742073059, + "sampling/importance_sampling_ratio/min": 0.22127984464168549, + "sampling/sampling_logp_difference/max": 1.5083271265029907, + "sampling/sampling_logp_difference/mean": 0.0200237687677145, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 163.546875, + "completions/mean_terminated_length": 163.546875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.22636833786964417, + "epoch": 0.43995098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.298762577698692, + "kl": 0.05313120782375336, + "learning_rate": 9.936476142872977e-07, + "loss": -0.0079, + "num_tokens": 11151088.0, + "reward": 0.3125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.806626319885254, + "sampling/importance_sampling_ratio/mean": 1.0003206729888916, + "sampling/importance_sampling_ratio/min": 0.48932701349258423, + "sampling/sampling_logp_difference/max": 0.7147243022918701, + "sampling/sampling_logp_difference/mean": 0.015751542523503304, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 207.84375, + "completions/mean_terminated_length": 207.84375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.26953721046447754, + "epoch": 0.4411764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6749487824217606, + "kl": 0.0324811227619648, + "learning_rate": 9.935339186375603e-07, + "loss": 0.0037, + "num_tokens": 11185222.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6977006196975708, + "sampling/importance_sampling_ratio/mean": 1.000344157218933, + "sampling/importance_sampling_ratio/min": 0.4160033166408539, + "sampling/sampling_logp_difference/max": 0.8770620226860046, + "sampling/sampling_logp_difference/mean": 0.016557641327381134, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 296.03125, + "completions/mean_terminated_length": 296.03125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2822401821613312, + "epoch": 0.4424019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5389362004608935, + "kl": 0.025479158386588097, + "learning_rate": 9.934192211452344e-07, + "loss": -0.0208, + "num_tokens": 11230536.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7178759574890137, + "sampling/importance_sampling_ratio/mean": 0.9992885589599609, + "sampling/importance_sampling_ratio/min": 0.35823339223861694, + "sampling/sampling_logp_difference/max": 1.0265705585479736, + "sampling/sampling_logp_difference/mean": 0.015677716583013535, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 282.828125, + "completions/mean_terminated_length": 282.828125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.24845105409622192, + "epoch": 0.44362745098039214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06160487922193275, + "kl": 0.02625298500061035, + "learning_rate": 9.933035220431487e-07, + "loss": 0.0002, + "num_tokens": 11266669.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5616941452026367, + "sampling/importance_sampling_ratio/mean": 0.9996004104614258, + "sampling/importance_sampling_ratio/min": 0.47252514958381653, + "sampling/sampling_logp_difference/max": 0.749664306640625, + "sampling/sampling_logp_difference/mean": 0.014867119491100311, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 213.8125, + "completions/mean_terminated_length": 213.8125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.26533043384552, + "epoch": 0.44485294117647056, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9237355703924648, + "kl": 0.04259977489709854, + "learning_rate": 9.931868215661647e-07, + "loss": -0.0115, + "num_tokens": 11296673.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8399380445480347, + "sampling/importance_sampling_ratio/mean": 0.9999451637268066, + "sampling/importance_sampling_ratio/min": 0.4808300733566284, + "sampling/sampling_logp_difference/max": 0.7322413921356201, + "sampling/sampling_logp_difference/mean": 0.01594679430127144, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 209.96875, + "completions/mean_terminated_length": 209.96875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.176650732755661, + "epoch": 0.44607843137254904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04743493969795115, + "kl": 0.02621782198548317, + "learning_rate": 9.930691199511773e-07, + "loss": 0.0003, + "num_tokens": 11323151.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000335454940796, + "sampling/importance_sampling_ratio/min": 0.5128016471862793, + "sampling/sampling_logp_difference/max": 0.7265121936798096, + "sampling/sampling_logp_difference/mean": 0.01218412071466446, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 256.109375, + "completions/mean_terminated_length": 256.109375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.24810782074928284, + "epoch": 0.44730392156862747, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7061983645548506, + "kl": 0.04609496146440506, + "learning_rate": 9.929504174371136e-07, + "loss": -0.0122, + "num_tokens": 11357430.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999706745147705, + "sampling/importance_sampling_ratio/min": 0.45373624563217163, + "sampling/sampling_logp_difference/max": 0.7902392148971558, + "sampling/sampling_logp_difference/mean": 0.014929584227502346, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 223.984375, + "completions/mean_terminated_length": 223.984375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2555466890335083, + "epoch": 0.4485294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3109607474275589, + "kl": 0.03534568101167679, + "learning_rate": 9.928307142649314e-07, + "loss": 0.0079, + "num_tokens": 11386405.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5941057205200195, + "sampling/importance_sampling_ratio/mean": 1.000087022781372, + "sampling/importance_sampling_ratio/min": 0.4885924458503723, + "sampling/sampling_logp_difference/max": 0.7162265777587891, + "sampling/sampling_logp_difference/mean": 0.01537355873733759, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 243.1875, + "completions/mean_terminated_length": 243.1875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2532086968421936, + "epoch": 0.4497549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5523789955813703, + "kl": 0.040189456194639206, + "learning_rate": 9.927100106776212e-07, + "loss": -0.0406, + "num_tokens": 11416769.0, + "reward": 0.75, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001037120819092, + "sampling/importance_sampling_ratio/min": 0.31536558270454407, + "sampling/sampling_logp_difference/max": 1.1540226936340332, + "sampling/sampling_logp_difference/mean": 0.01462498214095831, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 227.84375, + "completions/mean_terminated_length": 227.84375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2768535912036896, + "epoch": 0.45098039215686275, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8081566949498855, + "kl": 0.03865700215101242, + "learning_rate": 9.925883069202034e-07, + "loss": -0.1665, + "num_tokens": 11449927.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.972188115119934, + "sampling/importance_sampling_ratio/mean": 0.9997962117195129, + "sampling/importance_sampling_ratio/min": 0.4022789001464844, + "sampling/sampling_logp_difference/max": 0.9106096029281616, + "sampling/sampling_logp_difference/mean": 0.01726974919438362, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 319.421875, + "completions/mean_terminated_length": 319.421875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.28473591804504395, + "epoch": 0.4522058823529412, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8438301060346188, + "kl": 0.0417214035987854, + "learning_rate": 9.92465603239729e-07, + "loss": -0.0117, + "num_tokens": 11488962.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.0001962184906006, + "sampling/importance_sampling_ratio/min": 0.1541740596294403, + "sampling/sampling_logp_difference/max": 1.8696730136871338, + "sampling/sampling_logp_difference/mean": 0.015511645935475826, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 261.25, + "completions/mean_terminated_length": 261.25, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.2947255074977875, + "epoch": 0.4534313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2613504387461651, + "kl": 0.040817294269800186, + "learning_rate": 9.923418998852787e-07, + "loss": 0.0047, + "num_tokens": 11519922.0, + "reward": -0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000306367874146, + "sampling/importance_sampling_ratio/min": 0.35686278343200684, + "sampling/sampling_logp_difference/max": 1.030403971672058, + "sampling/sampling_logp_difference/mean": 0.016571637243032455, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 209.4375, + "completions/mean_terminated_length": 209.4375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2516079545021057, + "epoch": 0.45465686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061096885384244876, + "kl": 0.03814944624900818, + "learning_rate": 9.922171971079622e-07, + "loss": 0.0004, + "num_tokens": 11549982.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.995097041130066, + "sampling/importance_sampling_ratio/mean": 1.0001697540283203, + "sampling/importance_sampling_ratio/min": 0.5281258821487427, + "sampling/sampling_logp_difference/max": 0.690692663192749, + "sampling/sampling_logp_difference/mean": 0.014308227226138115, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 227.4375, + "completions/mean_terminated_length": 227.4375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.28589046001434326, + "epoch": 0.45588235294117646, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6545090252137973, + "kl": 0.062388550490140915, + "learning_rate": 9.920914951609186e-07, + "loss": 0.077, + "num_tokens": 11582650.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5279115438461304, + "sampling/importance_sampling_ratio/mean": 0.999853789806366, + "sampling/importance_sampling_ratio/min": 0.5709834694862366, + "sampling/sampling_logp_difference/max": 0.5603950023651123, + "sampling/sampling_logp_difference/mean": 0.013815401121973991, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 211.4375, + "completions/mean_terminated_length": 211.4375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2030314952135086, + "epoch": 0.4571078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05725399699741107, + "kl": 0.03566918522119522, + "learning_rate": 9.919647942993147e-07, + "loss": 0.0003, + "num_tokens": 11614246.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6054738759994507, + "sampling/importance_sampling_ratio/mean": 0.9997533559799194, + "sampling/importance_sampling_ratio/min": 0.5426429510116577, + "sampling/sampling_logp_difference/max": 0.6113038063049316, + "sampling/sampling_logp_difference/mean": 0.013296281918883324, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 268.515625, + "completions/mean_terminated_length": 268.515625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2816750109195709, + "epoch": 0.4583333333333333, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3493486641771302, + "kl": 0.05643295496702194, + "learning_rate": 9.918370947803455e-07, + "loss": -0.0437, + "num_tokens": 11654951.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4764822721481323, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.6057345867156982, + "sampling/sampling_logp_difference/max": 0.5013134479522705, + "sampling/sampling_logp_difference/mean": 0.0143747478723526, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 240.703125, + "completions/mean_terminated_length": 240.703125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2940741181373596, + "epoch": 0.45955882352941174, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6227521491911283, + "kl": 0.06486812233924866, + "learning_rate": 9.917083968632326e-07, + "loss": 0.008, + "num_tokens": 11685780.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001022815704346, + "sampling/importance_sampling_ratio/min": 0.5026050209999084, + "sampling/sampling_logp_difference/max": 0.732917308807373, + "sampling/sampling_logp_difference/mean": 0.016473714262247086, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 225.53125, + "completions/mean_terminated_length": 225.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.39877572655677795, + "epoch": 0.46078431372549017, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2824686087004187, + "kl": 0.08406936377286911, + "learning_rate": 9.915787008092246e-07, + "loss": 0.0294, + "num_tokens": 11722630.0, + "reward": 0.125, + "reward_std": 0.6047805547714233, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.9694862365722656, + "sampling/importance_sampling_ratio/mean": 1.000741958618164, + "sampling/importance_sampling_ratio/min": 0.5329796075820923, + "sampling/sampling_logp_difference/max": 0.6777727603912354, + "sampling/sampling_logp_difference/mean": 0.01972278580069542, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 184.296875, + "completions/mean_terminated_length": 184.296875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.25606122612953186, + "epoch": 0.46200980392156865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06060053819807976, + "kl": 0.04585852473974228, + "learning_rate": 9.914480068815961e-07, + "loss": 0.0005, + "num_tokens": 11754329.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6166514158248901, + "sampling/importance_sampling_ratio/mean": 1.0003126859664917, + "sampling/importance_sampling_ratio/min": 0.17059476673603058, + "sampling/sampling_logp_difference/max": 1.7684643268585205, + "sampling/sampling_logp_difference/mean": 0.016700156033039093, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 219.765625, + "completions/mean_terminated_length": 219.765625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.22416208684444427, + "epoch": 0.4632352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05300771080766106, + "kl": 0.03702693060040474, + "learning_rate": 9.913163153456482e-07, + "loss": 0.0004, + "num_tokens": 11783162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6099443435668945, + "sampling/importance_sampling_ratio/mean": 0.999893844127655, + "sampling/importance_sampling_ratio/min": 0.46763360500335693, + "sampling/sampling_logp_difference/max": 0.7600702047348022, + "sampling/sampling_logp_difference/mean": 0.01346520520746708, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 296.453125, + "completions/mean_terminated_length": 296.453125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.19604724645614624, + "epoch": 0.4644607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04359660214057077, + "kl": 0.03180554509162903, + "learning_rate": 9.91183626468706e-07, + "loss": 0.0003, + "num_tokens": 11821095.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071579217910767, + "sampling/importance_sampling_ratio/mean": 0.9998345375061035, + "sampling/importance_sampling_ratio/min": 0.573077380657196, + "sampling/sampling_logp_difference/max": 0.556734561920166, + "sampling/sampling_logp_difference/mean": 0.011631621047854424, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 230.515625, + "completions/mean_terminated_length": 230.515625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.33643507957458496, + "epoch": 0.46568627450980393, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8763197746933978, + "kl": 0.05861668288707733, + "learning_rate": 9.910499405201193e-07, + "loss": 0.0095, + "num_tokens": 11854712.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.613815426826477, + "sampling/importance_sampling_ratio/mean": 0.9995595216751099, + "sampling/importance_sampling_ratio/min": 0.1923624575138092, + "sampling/sampling_logp_difference/max": 1.6483738422393799, + "sampling/sampling_logp_difference/mean": 0.01788877323269844, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 173.984375, + "completions/mean_terminated_length": 173.984375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2999635636806488, + "epoch": 0.46691176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07982167416818635, + "kl": 0.06704611331224442, + "learning_rate": 9.909152577712625e-07, + "loss": 0.0006, + "num_tokens": 11880535.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5210472345352173, + "sampling/importance_sampling_ratio/mean": 1.0001461505889893, + "sampling/importance_sampling_ratio/min": 0.48961183428764343, + "sampling/sampling_logp_difference/max": 0.7141423225402832, + "sampling/sampling_logp_difference/mean": 0.018397457897663116, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 190.578125, + "completions/mean_terminated_length": 190.578125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.19606657326221466, + "epoch": 0.4681372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06131105622796064, + "kl": 0.045415766537189484, + "learning_rate": 9.907795784955326e-07, + "loss": 0.0005, + "num_tokens": 11909372.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.742784023284912, + "sampling/importance_sampling_ratio/mean": 0.9999132752418518, + "sampling/importance_sampling_ratio/min": 0.32741686701774597, + "sampling/sampling_logp_difference/max": 1.1165211200714111, + "sampling/sampling_logp_difference/mean": 0.013729160651564598, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 304.453125, + "completions/mean_terminated_length": 304.453125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.38282719254493713, + "epoch": 0.4693627450980392, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8662681575337112, + "kl": 0.0544503778219223, + "learning_rate": 9.906429029683504e-07, + "loss": -0.026, + "num_tokens": 11946009.0, + "reward": 0.0625, + "reward_std": 0.6645200252532959, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001490116119385, + "sampling/importance_sampling_ratio/min": 0.5103266835212708, + "sampling/sampling_logp_difference/max": 0.7706844806671143, + "sampling/sampling_logp_difference/mean": 0.016581425443291664, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 232.640625, + "completions/mean_terminated_length": 232.640625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.27952274680137634, + "epoch": 0.47058823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5734908930850455, + "kl": 0.050721801817417145, + "learning_rate": 9.90505231467158e-07, + "loss": -0.039, + "num_tokens": 11983330.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998074769973755, + "sampling/importance_sampling_ratio/min": 0.2890108525753021, + "sampling/sampling_logp_difference/max": 1.8018252849578857, + "sampling/sampling_logp_difference/mean": 0.018836161121726036, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 243.9375, + "completions/mean_terminated_length": 243.9375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2589993476867676, + "epoch": 0.47181372549019607, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4847781689335693, + "kl": 0.04911482334136963, + "learning_rate": 9.903665642714204e-07, + "loss": -0.0347, + "num_tokens": 12016574.0, + "reward": 0.21875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6172823905944824, + "sampling/importance_sampling_ratio/mean": 1.0000231266021729, + "sampling/importance_sampling_ratio/min": 0.5771543979644775, + "sampling/sampling_logp_difference/max": 0.5496454238891602, + "sampling/sampling_logp_difference/mean": 0.01421155035495758, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 284.515625, + "completions/mean_terminated_length": 284.515625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2348122000694275, + "epoch": 0.4730392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9464947450891469, + "kl": 0.028523039072752, + "learning_rate": 9.90226901662623e-07, + "loss": -0.0021, + "num_tokens": 12050415.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997332692146301, + "sampling/importance_sampling_ratio/min": 0.4180172085762024, + "sampling/sampling_logp_difference/max": 0.8722326755523682, + "sampling/sampling_logp_difference/mean": 0.014374499209225178, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 198.8125, + "completions/mean_terminated_length": 198.8125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2792658507823944, + "epoch": 0.4742647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2948043764887476, + "kl": 0.044489506632089615, + "learning_rate": 9.900862439242718e-07, + "loss": 0.0248, + "num_tokens": 12080835.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.934046745300293, + "sampling/importance_sampling_ratio/mean": 0.9999051690101624, + "sampling/importance_sampling_ratio/min": 0.4885071814060211, + "sampling/sampling_logp_difference/max": 0.7164011001586914, + "sampling/sampling_logp_difference/mean": 0.017194809392094612, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 179.6875, + "completions/mean_terminated_length": 179.6875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.32544904947280884, + "epoch": 0.47549019607843135, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.455400063814383, + "kl": 0.06876906752586365, + "learning_rate": 9.899445913418935e-07, + "loss": -0.0318, + "num_tokens": 12113871.0, + "reward": -0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5903608798980713, + "sampling/importance_sampling_ratio/mean": 1.0007281303405762, + "sampling/importance_sampling_ratio/min": 0.5240790843963623, + "sampling/sampling_logp_difference/max": 0.6461126804351807, + "sampling/sampling_logp_difference/mean": 0.019794166088104248, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 191.46875, + "completions/mean_terminated_length": 191.46875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2183282971382141, + "epoch": 0.47671568627450983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05303185126965551, + "kl": 0.03167909383773804, + "learning_rate": 9.898019442030337e-07, + "loss": 0.0003, + "num_tokens": 12140765.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004873275756836, + "sampling/importance_sampling_ratio/min": 0.6091670989990234, + "sampling/sampling_logp_difference/max": 0.837608814239502, + "sampling/sampling_logp_difference/mean": 0.013596564531326294, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 186.625, + "completions/mean_terminated_length": 186.625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3464588522911072, + "epoch": 0.47794117647058826, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4594377353718966, + "kl": 0.06954923272132874, + "learning_rate": 9.89658302797257e-07, + "loss": 0.0105, + "num_tokens": 12166821.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.9741272926330566, + "sampling/importance_sampling_ratio/mean": 1.0001782178878784, + "sampling/importance_sampling_ratio/min": 0.48248496651649475, + "sampling/sampling_logp_difference/max": 0.7288055419921875, + "sampling/sampling_logp_difference/mean": 0.019971702247858047, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 227.65625, + "completions/mean_terminated_length": 227.65625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.23684090375900269, + "epoch": 0.4791666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05047677743379039, + "kl": 0.028256338089704514, + "learning_rate": 9.895136674161464e-07, + "loss": 0.0003, + "num_tokens": 12198687.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00074303150177, + "sampling/importance_sampling_ratio/min": 0.2696039378643036, + "sampling/sampling_logp_difference/max": 1.3108012676239014, + "sampling/sampling_logp_difference/mean": 0.01520511694252491, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 279.90625, + "completions/mean_terminated_length": 279.90625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.30703943967819214, + "epoch": 0.4803921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9915338354451154, + "kl": 0.044222671538591385, + "learning_rate": 9.893680383533024e-07, + "loss": 0.0671, + "num_tokens": 12236281.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6116913557052612, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.43365907669067383, + "sampling/sampling_logp_difference/max": 0.8354966640472412, + "sampling/sampling_logp_difference/mean": 0.016938215121626854, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 274.8125, + "completions/mean_terminated_length": 274.8125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3057442009449005, + "epoch": 0.48161764705882354, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7707064477493701, + "kl": 0.07207056879997253, + "learning_rate": 9.892214159043433e-07, + "loss": -0.0496, + "num_tokens": 12273821.0, + "reward": 0.75, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.7151470184326172, + "sampling/importance_sampling_ratio/mean": 0.9999011754989624, + "sampling/importance_sampling_ratio/min": 0.4128479063510895, + "sampling/sampling_logp_difference/max": 0.8846759796142578, + "sampling/sampling_logp_difference/mean": 0.01808691769838333, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 233.640625, + "completions/mean_terminated_length": 233.640625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2136198878288269, + "epoch": 0.48284313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.328368413009168, + "kl": 0.03892651945352554, + "learning_rate": 9.890738003669027e-07, + "loss": 0.0535, + "num_tokens": 12304358.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000007152557373, + "sampling/importance_sampling_ratio/min": 0.4823680520057678, + "sampling/sampling_logp_difference/max": 0.9336278438568115, + "sampling/sampling_logp_difference/mean": 0.013449429534375668, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 245.71875, + "completions/mean_terminated_length": 245.71875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3100472092628479, + "epoch": 0.4840686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6724824585110218, + "kl": 0.04158332943916321, + "learning_rate": 9.889251920406312e-07, + "loss": -0.0308, + "num_tokens": 12334996.0, + "reward": -0.25, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5585296154022217, + "sampling/importance_sampling_ratio/mean": 1.000278115272522, + "sampling/importance_sampling_ratio/min": 0.26434141397476196, + "sampling/sampling_logp_difference/max": 1.3305137157440186, + "sampling/sampling_logp_difference/mean": 0.01594216749072075, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 382.78125, + "completions/mean_terminated_length": 382.78125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.18888866901397705, + "epoch": 0.4852941176470588, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7714379917556299, + "kl": 0.02613941766321659, + "learning_rate": 9.887755912271942e-07, + "loss": 0.0191, + "num_tokens": 12376486.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996835589408875, + "sampling/importance_sampling_ratio/min": 0.32474690675735474, + "sampling/sampling_logp_difference/max": 1.124709129333496, + "sampling/sampling_logp_difference/mean": 0.010552226565778255, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 251.859375, + "completions/mean_terminated_length": 251.859375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2508004605770111, + "epoch": 0.48651960784313725, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.227985971228998, + "kl": 0.06145814433693886, + "learning_rate": 9.886249982302718e-07, + "loss": -0.0478, + "num_tokens": 12410653.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000058889389038, + "sampling/importance_sampling_ratio/min": 0.2936866283416748, + "sampling/sampling_logp_difference/max": 1.2252418994903564, + "sampling/sampling_logp_difference/mean": 0.016481686383485794, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 985.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 313.921875, + "completions/mean_terminated_length": 313.921875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.28108274936676025, + "epoch": 0.4877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07062007147852627, + "kl": 0.05387008190155029, + "learning_rate": 9.884734133555585e-07, + "loss": 0.0004, + "num_tokens": 12447752.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000408411026001, + "sampling/importance_sampling_ratio/min": 0.5632816553115845, + "sampling/sampling_logp_difference/max": 0.9375678300857544, + "sampling/sampling_logp_difference/mean": 0.01568533293902874, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 210.546875, + "completions/mean_terminated_length": 210.546875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.21270164847373962, + "epoch": 0.4889705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0750156076656718, + "kl": 0.03275253623723984, + "learning_rate": 9.883208369107617e-07, + "loss": 0.0003, + "num_tokens": 12477115.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.809498906135559, + "sampling/importance_sampling_ratio/mean": 0.9995313882827759, + "sampling/importance_sampling_ratio/min": 0.21648813784122467, + "sampling/sampling_logp_difference/max": 1.530219554901123, + "sampling/sampling_logp_difference/mean": 0.014600234106183052, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 207.140625, + "completions/mean_terminated_length": 207.140625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.20452356338500977, + "epoch": 0.49019607843137253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06209651334677291, + "kl": 0.0376552939414978, + "learning_rate": 9.88167269205602e-07, + "loss": 0.0004, + "num_tokens": 12503556.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8523972034454346, + "sampling/importance_sampling_ratio/mean": 1.0001587867736816, + "sampling/importance_sampling_ratio/min": 0.6124428510665894, + "sampling/sampling_logp_difference/max": 0.6164805889129639, + "sampling/sampling_logp_difference/mean": 0.012564984150230885, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.20392152667045593, + "epoch": 0.49142156862745096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05089353030210747, + "kl": 0.02923227660357952, + "learning_rate": 9.880127105518122e-07, + "loss": 0.0003, + "num_tokens": 12538766.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8718515634536743, + "sampling/importance_sampling_ratio/mean": 1.0003801584243774, + "sampling/importance_sampling_ratio/min": 0.37230610847473145, + "sampling/sampling_logp_difference/max": 0.9880388975143433, + "sampling/sampling_logp_difference/mean": 0.012940148822963238, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 166.734375, + "completions/mean_terminated_length": 166.734375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.29518431425094604, + "epoch": 0.49264705882352944, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.529219640519847, + "kl": 0.05905640870332718, + "learning_rate": 9.878571612631363e-07, + "loss": 0.0531, + "num_tokens": 12564701.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6979831457138062, + "sampling/importance_sampling_ratio/mean": 0.9998425245285034, + "sampling/importance_sampling_ratio/min": 0.5172002911567688, + "sampling/sampling_logp_difference/max": 0.6593250036239624, + "sampling/sampling_logp_difference/mean": 0.018709469586610794, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 193.671875, + "completions/mean_terminated_length": 193.671875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2510262131690979, + "epoch": 0.49387254901960786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33292477718977276, + "kl": 0.04922737926244736, + "learning_rate": 9.8770062165533e-07, + "loss": 0.0005, + "num_tokens": 12599480.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5513496398925781, + "sampling/importance_sampling_ratio/mean": 0.999625563621521, + "sampling/importance_sampling_ratio/min": 0.30741992592811584, + "sampling/sampling_logp_difference/max": 1.1795406341552734, + "sampling/sampling_logp_difference/mean": 0.017182782292366028, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 145.90625, + "completions/mean_terminated_length": 145.90625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.19823022186756134, + "epoch": 0.4950980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.826081871373586, + "kl": 0.0499572679400444, + "learning_rate": 9.875430920461583e-07, + "loss": 0.0142, + "num_tokens": 12626114.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.693151593208313, + "sampling/importance_sampling_ratio/mean": 0.9995004534721375, + "sampling/importance_sampling_ratio/min": 0.23994095623493195, + "sampling/sampling_logp_difference/max": 1.4273624420166016, + "sampling/sampling_logp_difference/mean": 0.01597466692328453, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 181.0625, + "completions/mean_terminated_length": 181.0625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.19514986872673035, + "epoch": 0.4963235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0834316994079156, + "kl": 0.042915742844343185, + "learning_rate": 9.873845727553965e-07, + "loss": 0.0004, + "num_tokens": 12660022.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8722196817398071, + "sampling/importance_sampling_ratio/mean": 0.9999871850013733, + "sampling/importance_sampling_ratio/min": 0.6210613250732422, + "sampling/sampling_logp_difference/max": 0.6271247863769531, + "sampling/sampling_logp_difference/mean": 0.013638028874993324, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 173.671875, + "completions/mean_terminated_length": 173.671875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.23224231600761414, + "epoch": 0.49754901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20167192962952737, + "kl": 0.04924897104501724, + "learning_rate": 9.87225064104829e-07, + "loss": 0.0005, + "num_tokens": 12686065.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.815342903137207, + "sampling/importance_sampling_ratio/mean": 0.99947190284729, + "sampling/importance_sampling_ratio/min": 0.1088278517127037, + "sampling/sampling_logp_difference/max": 2.2179880142211914, + "sampling/sampling_logp_difference/mean": 0.016529075801372528, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 267.5, + "completions/mean_terminated_length": 267.5, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.16993097960948944, + "epoch": 0.4987745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035463469020333434, + "kl": 0.023991378024220467, + "learning_rate": 9.870645664182476e-07, + "loss": 0.0002, + "num_tokens": 12719585.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999097585678101, + "sampling/importance_sampling_ratio/min": 0.44757285714149475, + "sampling/sampling_logp_difference/max": 1.5597548484802246, + "sampling/sampling_logp_difference/mean": 0.011877389624714851, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 173.078125, + "completions/mean_terminated_length": 173.078125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2073623687028885, + "epoch": 0.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06405288974078595, + "kl": 0.0290830098092556, + "learning_rate": 9.86903080021453e-07, + "loss": 0.0003, + "num_tokens": 12749046.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001136064529419, + "sampling/importance_sampling_ratio/min": 0.42510080337524414, + "sampling/sampling_logp_difference/max": 1.110346794128418, + "sampling/sampling_logp_difference/mean": 0.01655196025967598, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 269.015625, + "completions/mean_terminated_length": 269.015625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2286536544561386, + "epoch": 0.5012254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05105192718276332, + "kl": 0.023631129413843155, + "learning_rate": 9.867406052422523e-07, + "loss": 0.0002, + "num_tokens": 12788423.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997172355651855, + "sampling/importance_sampling_ratio/min": 0.4074231684207916, + "sampling/sampling_logp_difference/max": 0.9378218650817871, + "sampling/sampling_logp_difference/mean": 0.015676937997341156, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 242.40625, + "completions/mean_terminated_length": 242.40625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2490488588809967, + "epoch": 0.5024509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.140636246592225, + "kl": 0.04674581438302994, + "learning_rate": 9.865771424104587e-07, + "loss": 0.0026, + "num_tokens": 12821025.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.804461121559143, + "sampling/importance_sampling_ratio/mean": 1.000612497329712, + "sampling/importance_sampling_ratio/min": 0.5682246088981628, + "sampling/sampling_logp_difference/max": 0.5902619361877441, + "sampling/sampling_logp_difference/mean": 0.014655455946922302, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 328.0625, + "completions/mean_terminated_length": 328.0625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2791997492313385, + "epoch": 0.5036764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7274215866255332, + "kl": 0.03656107187271118, + "learning_rate": 9.864126918578919e-07, + "loss": -0.0169, + "num_tokens": 12860101.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.9620814323425293, + "sampling/importance_sampling_ratio/mean": 0.9998489618301392, + "sampling/importance_sampling_ratio/min": 0.27422481775283813, + "sampling/sampling_logp_difference/max": 1.293807029724121, + "sampling/sampling_logp_difference/mean": 0.017559055238962173, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 179.3125, + "completions/mean_terminated_length": 179.3125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.1782495230436325, + "epoch": 0.5049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10630724203547498, + "kl": 0.040607184171676636, + "learning_rate": 9.862472539183755e-07, + "loss": 0.0004, + "num_tokens": 12887065.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9096622467041016, + "sampling/importance_sampling_ratio/mean": 0.9999580979347229, + "sampling/importance_sampling_ratio/min": 0.138926163315773, + "sampling/sampling_logp_difference/max": 1.9738126993179321, + "sampling/sampling_logp_difference/mean": 0.01566651463508606, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 173.125, + "completions/mean_terminated_length": 173.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.15380318462848663, + "epoch": 0.5061274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06374476952125667, + "kl": 0.02124452218413353, + "learning_rate": 9.860808289277385e-07, + "loss": 0.0002, + "num_tokens": 12915121.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002858638763428, + "sampling/importance_sampling_ratio/min": 0.4787582457065582, + "sampling/sampling_logp_difference/max": 0.8502476215362549, + "sampling/sampling_logp_difference/mean": 0.012755339965224266, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 196.6875, + "completions/mean_terminated_length": 196.6875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.18463312089443207, + "epoch": 0.5073529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.45936447945665526, + "kl": 0.041250187903642654, + "learning_rate": 9.859134172238128e-07, + "loss": 0.0004, + "num_tokens": 12942349.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.62086820602417, + "sampling/importance_sampling_ratio/mean": 0.9995936155319214, + "sampling/importance_sampling_ratio/min": 0.524817705154419, + "sampling/sampling_logp_difference/max": 0.6447043418884277, + "sampling/sampling_logp_difference/mean": 0.01295830775052309, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 186.421875, + "completions/mean_terminated_length": 186.421875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.1308806836605072, + "epoch": 0.508578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12053416482904825, + "kl": 0.021573197096586227, + "learning_rate": 9.857450191464337e-07, + "loss": 0.0002, + "num_tokens": 12970232.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999610424041748, + "sampling/importance_sampling_ratio/min": 0.20379473268985748, + "sampling/sampling_logp_difference/max": 1.590641975402832, + "sampling/sampling_logp_difference/mean": 0.011412444524466991, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 177.6875, + "completions/mean_terminated_length": 177.6875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.25102418661117554, + "epoch": 0.5098039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05888729256866505, + "kl": 0.04825008660554886, + "learning_rate": 9.855756350374386e-07, + "loss": 0.0005, + "num_tokens": 13010228.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.742524266242981, + "sampling/importance_sampling_ratio/mean": 0.9999663829803467, + "sampling/importance_sampling_ratio/min": 0.374472051858902, + "sampling/sampling_logp_difference/max": 0.9822380542755127, + "sampling/sampling_logp_difference/mean": 0.018441712483763695, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 209.625, + "completions/mean_terminated_length": 209.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.27332425117492676, + "epoch": 0.5110294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6965885955451983, + "kl": 0.06332144141197205, + "learning_rate": 9.854052652406665e-07, + "loss": 0.0244, + "num_tokens": 13044620.0, + "reward": -0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.958435297012329, + "sampling/importance_sampling_ratio/mean": 0.9991956949234009, + "sampling/importance_sampling_ratio/min": 0.2559586465358734, + "sampling/sampling_logp_difference/max": 1.3627393245697021, + "sampling/sampling_logp_difference/mean": 0.01965285651385784, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 201.765625, + "completions/mean_terminated_length": 201.765625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.20585131645202637, + "epoch": 0.5122549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4773288838893448, + "kl": 0.028390567749738693, + "learning_rate": 9.852339101019572e-07, + "loss": -0.0381, + "num_tokens": 13074701.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7199848890304565, + "sampling/importance_sampling_ratio/mean": 0.9992534518241882, + "sampling/importance_sampling_ratio/min": 0.3371962904930115, + "sampling/sampling_logp_difference/max": 1.087090015411377, + "sampling/sampling_logp_difference/mean": 0.013995368033647537, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 182.046875, + "completions/mean_terminated_length": 182.046875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.19686485826969147, + "epoch": 0.5134803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04236162971608435, + "kl": 0.036349326372146606, + "learning_rate": 9.85061569969151e-07, + "loss": 0.0003, + "num_tokens": 13107600.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6127358675003052, + "sampling/importance_sampling_ratio/mean": 1.0003676414489746, + "sampling/importance_sampling_ratio/min": 0.4934726655483246, + "sampling/sampling_logp_difference/max": 0.7062878608703613, + "sampling/sampling_logp_difference/mean": 0.015297004953026772, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 187.765625, + "completions/mean_terminated_length": 187.765625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.172816663980484, + "epoch": 0.5147058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04712307185938354, + "kl": 0.026017405092716217, + "learning_rate": 9.848882451920875e-07, + "loss": 0.0002, + "num_tokens": 13137985.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8718520402908325, + "sampling/importance_sampling_ratio/mean": 0.9995954036712646, + "sampling/importance_sampling_ratio/min": 0.02654479630291462, + "sampling/sampling_logp_difference/max": 3.6289215087890625, + "sampling/sampling_logp_difference/mean": 0.013399068266153336, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 181.640625, + "completions/mean_terminated_length": 181.640625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.1799655705690384, + "epoch": 0.5159313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1032649243091597, + "kl": 0.03207855671644211, + "learning_rate": 9.847139361226046e-07, + "loss": -0.0172, + "num_tokens": 13164330.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.611358642578125, + "sampling/importance_sampling_ratio/mean": 1.000049114227295, + "sampling/importance_sampling_ratio/min": 0.35595452785491943, + "sampling/sampling_logp_difference/max": 1.0329523086547852, + "sampling/sampling_logp_difference/mean": 0.01461087167263031, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 172.4375, + "completions/mean_terminated_length": 172.4375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.20940491557121277, + "epoch": 0.5171568627450981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07394043380277103, + "kl": 0.03250215947628021, + "learning_rate": 9.84538643114539e-07, + "loss": 0.0003, + "num_tokens": 13187846.0, + "reward": -1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996784925460815, + "sampling/importance_sampling_ratio/min": 0.5260618329048157, + "sampling/sampling_logp_difference/max": 0.8738067150115967, + "sampling/sampling_logp_difference/mean": 0.01687995158135891, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 148.125, + "completions/mean_terminated_length": 148.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.1665087342262268, + "epoch": 0.5183823529411765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08174180673496445, + "kl": 0.037060923874378204, + "learning_rate": 9.843623665237242e-07, + "loss": 0.0004, + "num_tokens": 13216654.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008127689361572, + "sampling/importance_sampling_ratio/min": 0.4871967136859894, + "sampling/sampling_logp_difference/max": 0.953502893447876, + "sampling/sampling_logp_difference/mean": 0.014931373298168182, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 193.03125, + "completions/mean_terminated_length": 193.03125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.18435706198215485, + "epoch": 0.5196078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328042297588035, + "kl": 0.02889874018728733, + "learning_rate": 9.841851067079908e-07, + "loss": 0.0003, + "num_tokens": 13245504.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.91254460811615, + "sampling/importance_sampling_ratio/mean": 0.9999415874481201, + "sampling/importance_sampling_ratio/min": 0.2732740640640259, + "sampling/sampling_logp_difference/max": 1.2972800731658936, + "sampling/sampling_logp_difference/mean": 0.014272722415626049, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 166.640625, + "completions/mean_terminated_length": 166.640625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2041907012462616, + "epoch": 0.5208333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0727736574864572, + "kl": 0.04394741728901863, + "learning_rate": 9.840068640271647e-07, + "loss": 0.0005, + "num_tokens": 13271129.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000685453414917, + "sampling/importance_sampling_ratio/min": 0.4440104067325592, + "sampling/sampling_logp_difference/max": 0.8119072914123535, + "sampling/sampling_logp_difference/mean": 0.015166521072387695, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 147.4375, + "completions/mean_terminated_length": 147.4375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.148129403591156, + "epoch": 0.5220588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05450949523051405, + "kl": 0.032044753432273865, + "learning_rate": 9.838276388430675e-07, + "loss": 0.0003, + "num_tokens": 13295349.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6977241039276123, + "sampling/importance_sampling_ratio/mean": 1.000300645828247, + "sampling/importance_sampling_ratio/min": 0.6411559581756592, + "sampling/sampling_logp_difference/max": 0.529288649559021, + "sampling/sampling_logp_difference/mean": 0.012159367091953754, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 235.65625, + "completions/mean_terminated_length": 235.65625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.23714864253997803, + "epoch": 0.5232843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2307582275952513, + "kl": 0.05249298736453056, + "learning_rate": 9.836474315195147e-07, + "loss": 0.0382, + "num_tokens": 13332767.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8917368650436401, + "sampling/importance_sampling_ratio/mean": 1.0008130073547363, + "sampling/importance_sampling_ratio/min": 0.44529467821121216, + "sampling/sampling_logp_difference/max": 0.8090190887451172, + "sampling/sampling_logp_difference/mean": 0.018667148426175117, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 133.59375, + "completions/mean_terminated_length": 133.59375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.16025342047214508, + "epoch": 0.5245098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06014264560023584, + "kl": 0.03190026804804802, + "learning_rate": 9.83466242422316e-07, + "loss": 0.0003, + "num_tokens": 13356021.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001938343048096, + "sampling/importance_sampling_ratio/min": 0.4700278043746948, + "sampling/sampling_logp_difference/max": 0.7751011848449707, + "sampling/sampling_logp_difference/mean": 0.012300319038331509, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 210.09375, + "completions/mean_terminated_length": 210.09375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.22527235746383667, + "epoch": 0.5257352941176471, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9659498041966834, + "kl": 0.05424007773399353, + "learning_rate": 9.832840719192735e-07, + "loss": -0.0098, + "num_tokens": 13388075.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001205205917358, + "sampling/importance_sampling_ratio/min": 0.2378673404455185, + "sampling/sampling_logp_difference/max": 1.4360421895980835, + "sampling/sampling_logp_difference/mean": 0.017529528588056564, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 250.9375, + "completions/mean_terminated_length": 250.9375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.21268804371356964, + "epoch": 0.5269607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040568360055927316, + "kl": 0.03319639712572098, + "learning_rate": 9.831009203801822e-07, + "loss": 0.0002, + "num_tokens": 13424455.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6312425136566162, + "sampling/importance_sampling_ratio/mean": 0.9993811249732971, + "sampling/importance_sampling_ratio/min": 0.5135435461997986, + "sampling/sampling_logp_difference/max": 0.6664204597473145, + "sampling/sampling_logp_difference/mean": 0.01416376419365406, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 238.734375, + "completions/mean_terminated_length": 238.734375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.16849209368228912, + "epoch": 0.5281862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0314614954354919, + "kl": 0.024410758167505264, + "learning_rate": 9.829167881768277e-07, + "loss": 0.0002, + "num_tokens": 13460134.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001863241195679, + "sampling/importance_sampling_ratio/min": 0.3524653911590576, + "sampling/sampling_logp_difference/max": 1.0428028106689453, + "sampling/sampling_logp_difference/mean": 0.013139157555997372, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 257.328125, + "completions/mean_terminated_length": 257.328125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.29006510972976685, + "epoch": 0.5294117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.082350928696217, + "kl": 0.024110358208417892, + "learning_rate": 9.82731675682987e-07, + "loss": 0.017, + "num_tokens": 13496539.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9817054271697998, + "sampling/importance_sampling_ratio/mean": 1.0004513263702393, + "sampling/importance_sampling_ratio/min": 0.27278974652290344, + "sampling/sampling_logp_difference/max": 1.2990540266036987, + "sampling/sampling_logp_difference/mean": 0.017118092626333237, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 191.890625, + "completions/mean_terminated_length": 191.890625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.21561165153980255, + "epoch": 0.5306372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0240772764038137, + "kl": 0.05167914927005768, + "learning_rate": 9.825455832744266e-07, + "loss": 0.0336, + "num_tokens": 13529588.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001323223114014, + "sampling/importance_sampling_ratio/min": 0.5724638104438782, + "sampling/sampling_logp_difference/max": 0.724461555480957, + "sampling/sampling_logp_difference/mean": 0.014281702227890491, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 284.90625, + "completions/mean_terminated_length": 284.90625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.24130161106586456, + "epoch": 0.5318627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04439451275842723, + "kl": 0.021637115627527237, + "learning_rate": 9.823585113289023e-07, + "loss": 0.0002, + "num_tokens": 13576526.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000152349472046, + "sampling/importance_sampling_ratio/min": 0.2516302466392517, + "sampling/sampling_logp_difference/max": 1.3797945976257324, + "sampling/sampling_logp_difference/mean": 0.016831308603286743, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 217.5, + "completions/mean_terminated_length": 217.5, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.2319016456604004, + "epoch": 0.5330882352941176, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4675323426382914, + "kl": 0.04950553923845291, + "learning_rate": 9.821704602261585e-07, + "loss": 0.0118, + "num_tokens": 13612814.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997687935829163, + "sampling/importance_sampling_ratio/min": 0.5444316267967224, + "sampling/sampling_logp_difference/max": 0.9254751205444336, + "sampling/sampling_logp_difference/mean": 0.015987036749720573, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 176.875, + "completions/mean_terminated_length": 176.875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.18741881847381592, + "epoch": 0.5343137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.602791349310115, + "kl": 0.03300906717777252, + "learning_rate": 9.819814303479267e-07, + "loss": 0.0146, + "num_tokens": 13640854.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9876214265823364, + "sampling/importance_sampling_ratio/mean": 1.000852346420288, + "sampling/importance_sampling_ratio/min": 0.30076903104782104, + "sampling/sampling_logp_difference/max": 1.2014126777648926, + "sampling/sampling_logp_difference/mean": 0.014924418181180954, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 165.578125, + "completions/mean_terminated_length": 165.578125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.21863728761672974, + "epoch": 0.5355392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05073817343117938, + "kl": 0.04330986738204956, + "learning_rate": 9.817914220779256e-07, + "loss": 0.0004, + "num_tokens": 13669483.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 0.4652920067310333, + "sampling/sampling_logp_difference/max": 0.8106462955474854, + "sampling/sampling_logp_difference/mean": 0.01707840897142887, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.1936556100845337, + "epoch": 0.5367647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15335244089432232, + "kl": 0.048208750784397125, + "learning_rate": 9.816004358018603e-07, + "loss": 0.0005, + "num_tokens": 13695995.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000241994857788, + "sampling/importance_sampling_ratio/min": 0.4078652560710907, + "sampling/sampling_logp_difference/max": 0.9372653961181641, + "sampling/sampling_logp_difference/mean": 0.015760913491249084, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 173.296875, + "completions/mean_terminated_length": 173.296875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.16149796545505524, + "epoch": 0.5379901960784313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058144732808689745, + "kl": 0.0249975323677063, + "learning_rate": 9.814084719074204e-07, + "loss": 0.0002, + "num_tokens": 13723310.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007734298706055, + "sampling/importance_sampling_ratio/min": 0.5054357051849365, + "sampling/sampling_logp_difference/max": 0.9605855941772461, + "sampling/sampling_logp_difference/mean": 0.014678625389933586, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 209.4375, + "completions/mean_terminated_length": 209.4375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.20305787026882172, + "epoch": 0.5392156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.211216325141133, + "kl": 0.06661221385002136, + "learning_rate": 9.81215530784281e-07, + "loss": -0.007, + "num_tokens": 13752570.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6356521844863892, + "sampling/importance_sampling_ratio/mean": 1.0004148483276367, + "sampling/importance_sampling_ratio/min": 0.4958733022212982, + "sampling/sampling_logp_difference/max": 0.701434850692749, + "sampling/sampling_logp_difference/mean": 0.013643559068441391, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 154.40625, + "completions/mean_terminated_length": 154.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.15312139689922333, + "epoch": 0.5404411764705882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0928211989498208, + "kl": 0.023275483399629593, + "learning_rate": 9.810216128240996e-07, + "loss": 0.0002, + "num_tokens": 13776740.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6610887050628662, + "sampling/importance_sampling_ratio/mean": 0.9994547367095947, + "sampling/importance_sampling_ratio/min": 0.45958274602890015, + "sampling/sampling_logp_difference/max": 0.7774362564086914, + "sampling/sampling_logp_difference/mean": 0.013531510718166828, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 221.78125, + "completions/mean_terminated_length": 221.78125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.17526143789291382, + "epoch": 0.5416666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05870698110631625, + "kl": 0.03668907284736633, + "learning_rate": 9.808267184205181e-07, + "loss": 0.0003, + "num_tokens": 13812566.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999535322189331, + "sampling/importance_sampling_ratio/min": 0.4309251308441162, + "sampling/sampling_logp_difference/max": 0.8418209552764893, + "sampling/sampling_logp_difference/mean": 0.014167534187436104, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 257.953125, + "completions/mean_terminated_length": 257.953125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.2431107461452484, + "epoch": 0.5428921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036190409674592666, + "kl": 0.03278276324272156, + "learning_rate": 9.806308479691594e-07, + "loss": 0.0003, + "num_tokens": 13850307.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.849806547164917, + "sampling/importance_sampling_ratio/mean": 0.9998242259025574, + "sampling/importance_sampling_ratio/min": 0.415968656539917, + "sampling/sampling_logp_difference/max": 0.8771454095840454, + "sampling/sampling_logp_difference/mean": 0.016216067597270012, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 153.953125, + "completions/mean_terminated_length": 153.953125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.22003903985023499, + "epoch": 0.5441176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2512830909674193, + "kl": 0.06804344803094864, + "learning_rate": 9.80434001867628e-07, + "loss": 0.0006, + "num_tokens": 13885056.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6728312969207764, + "sampling/importance_sampling_ratio/mean": 0.9997739791870117, + "sampling/importance_sampling_ratio/min": 0.28261417150497437, + "sampling/sampling_logp_difference/max": 1.2636725902557373, + "sampling/sampling_logp_difference/mean": 0.018444612622261047, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 173.25, + "completions/mean_terminated_length": 173.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.18086552619934082, + "epoch": 0.5453431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18589688409829208, + "kl": 0.03681885451078415, + "learning_rate": 9.802361805155097e-07, + "loss": 0.0004, + "num_tokens": 13909120.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9687495231628418, + "sampling/importance_sampling_ratio/mean": 1.0004655122756958, + "sampling/importance_sampling_ratio/min": 0.3846133351325989, + "sampling/sampling_logp_difference/max": 0.9555168151855469, + "sampling/sampling_logp_difference/mean": 0.014596343040466309, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 247.53125, + "completions/mean_terminated_length": 247.53125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.22909684479236603, + "epoch": 0.5465686274509803, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4066834384561027, + "kl": 0.03148864209651947, + "learning_rate": 9.800373843143683e-07, + "loss": -0.0148, + "num_tokens": 13952706.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.952415108680725, + "sampling/importance_sampling_ratio/mean": 0.9993150234222412, + "sampling/importance_sampling_ratio/min": 0.509168803691864, + "sampling/sampling_logp_difference/max": 0.6749756336212158, + "sampling/sampling_logp_difference/mean": 0.01676628738641739, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 192.53125, + "completions/mean_terminated_length": 192.53125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.21134613454341888, + "epoch": 0.5477941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04649144067970036, + "kl": 0.026450635865330696, + "learning_rate": 9.798376136677484e-07, + "loss": 0.0003, + "num_tokens": 13981988.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6316014528274536, + "sampling/importance_sampling_ratio/mean": 1.00005042552948, + "sampling/importance_sampling_ratio/min": 0.36388882994651794, + "sampling/sampling_logp_difference/max": 1.0109069347381592, + "sampling/sampling_logp_difference/mean": 0.01675606518983841, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 136.75, + "completions/mean_terminated_length": 136.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.18353894352912903, + "epoch": 0.5490196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07195227881882807, + "kl": 0.025355882942676544, + "learning_rate": 9.796368689811712e-07, + "loss": 0.0003, + "num_tokens": 14008724.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007448196411133, + "sampling/importance_sampling_ratio/min": 0.5391724109649658, + "sampling/sampling_logp_difference/max": 0.854957103729248, + "sampling/sampling_logp_difference/mean": 0.015763752162456512, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 160.984375, + "completions/mean_terminated_length": 160.984375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.19537027180194855, + "epoch": 0.5502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1786144413384627, + "kl": 0.043188292533159256, + "learning_rate": 9.79435150662136e-07, + "loss": -0.0323, + "num_tokens": 14032211.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6207278966903687, + "sampling/importance_sampling_ratio/mean": 0.9998423457145691, + "sampling/importance_sampling_ratio/min": 0.5002011656761169, + "sampling/sampling_logp_difference/max": 0.6927449703216553, + "sampling/sampling_logp_difference/mean": 0.014301232062280178, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 240.484375, + "completions/mean_terminated_length": 240.484375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.1776406466960907, + "epoch": 0.5514705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1684315466351016, + "kl": 0.01867382600903511, + "learning_rate": 9.792324591201177e-07, + "loss": -0.0018, + "num_tokens": 14071186.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003172159194946, + "sampling/importance_sampling_ratio/min": 0.4056454002857208, + "sampling/sampling_logp_difference/max": 1.3454623222351074, + "sampling/sampling_logp_difference/mean": 0.013690703548491001, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 202.015625, + "completions/mean_terminated_length": 202.015625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.23214983940124512, + "epoch": 0.5526960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2704091917734717, + "kl": 0.03874318674206734, + "learning_rate": 9.790287947665681e-07, + "loss": -0.0046, + "num_tokens": 14106003.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998515248298645, + "sampling/importance_sampling_ratio/min": 0.3239980638027191, + "sampling/sampling_logp_difference/max": 1.1270177364349365, + "sampling/sampling_logp_difference/mean": 0.015781264752149582, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 171.78125, + "completions/mean_terminated_length": 171.78125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.1903134286403656, + "epoch": 0.553921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06348400197571344, + "kl": 0.02989325113594532, + "learning_rate": 9.788241580149122e-07, + "loss": 0.0003, + "num_tokens": 14139461.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9100010395050049, + "sampling/importance_sampling_ratio/mean": 0.9999560713768005, + "sampling/importance_sampling_ratio/min": 0.5976356863975525, + "sampling/sampling_logp_difference/max": 0.6471037864685059, + "sampling/sampling_logp_difference/mean": 0.015057485550642014, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 138.28125, + "completions/mean_terminated_length": 138.28125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.18952172994613647, + "epoch": 0.5551470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09591476875993776, + "kl": 0.04611412063241005, + "learning_rate": 9.786185492805501e-07, + "loss": 0.0005, + "num_tokens": 14163767.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996397495269775, + "sampling/importance_sampling_ratio/min": 0.4265861511230469, + "sampling/sampling_logp_difference/max": 0.8519408702850342, + "sampling/sampling_logp_difference/mean": 0.01649576798081398, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 150.828125, + "completions/mean_terminated_length": 150.828125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.1970883011817932, + "epoch": 0.5563725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04987476389528286, + "kl": 0.04483035206794739, + "learning_rate": 9.784119689808542e-07, + "loss": 0.0004, + "num_tokens": 14193116.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001142024993896, + "sampling/importance_sampling_ratio/min": 0.4021225869655609, + "sampling/sampling_logp_difference/max": 0.9109983444213867, + "sampling/sampling_logp_difference/mean": 0.016262732446193695, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 186.5, + "completions/mean_terminated_length": 186.5, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.1636163592338562, + "epoch": 0.5575980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8765250651093985, + "kl": 0.016119275242090225, + "learning_rate": 9.782044175351699e-07, + "loss": -0.0518, + "num_tokens": 14225196.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8127318620681763, + "sampling/importance_sampling_ratio/mean": 0.9994531273841858, + "sampling/importance_sampling_ratio/min": 0.5260621905326843, + "sampling/sampling_logp_difference/max": 0.6423358917236328, + "sampling/sampling_logp_difference/mean": 0.012707584537565708, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 186.34375, + "completions/mean_terminated_length": 186.34375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.22686845064163208, + "epoch": 0.5588235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4617782678969478, + "kl": 0.058060139417648315, + "learning_rate": 9.779958953648129e-07, + "loss": 0.0103, + "num_tokens": 14256354.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003458261489868, + "sampling/importance_sampling_ratio/min": 0.24500620365142822, + "sampling/sampling_logp_difference/max": 2.2618141174316406, + "sampling/sampling_logp_difference/mean": 0.017254013568162918, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 161.0, + "completions/mean_terminated_length": 161.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.21629858016967773, + "epoch": 0.5600490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07544619848491381, + "kl": 0.047936707735061646, + "learning_rate": 9.777864028930705e-07, + "loss": 0.0004, + "num_tokens": 14282466.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003684759140015, + "sampling/importance_sampling_ratio/min": 0.5428855419158936, + "sampling/sampling_logp_difference/max": 0.7252051830291748, + "sampling/sampling_logp_difference/mean": 0.01494472287595272, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 209.75, + "completions/mean_terminated_length": 209.75, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.11612190306186676, + "epoch": 0.5612745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03621998780538075, + "kl": 0.014105316251516342, + "learning_rate": 9.775759405451986e-07, + "loss": 0.0001, + "num_tokens": 14316578.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000373125076294, + "sampling/importance_sampling_ratio/min": 0.39854487776756287, + "sampling/sampling_logp_difference/max": 0.9199352264404297, + "sampling/sampling_logp_difference/mean": 0.009833992458879948, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 193.5, + "completions/mean_terminated_length": 193.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.20763754844665527, + "epoch": 0.5625, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7741586485869132, + "kl": 0.03014390356838703, + "learning_rate": 9.773645087484228e-07, + "loss": -0.0053, + "num_tokens": 14345858.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6122909784317017, + "sampling/importance_sampling_ratio/mean": 0.9992804527282715, + "sampling/importance_sampling_ratio/min": 0.43093565106391907, + "sampling/sampling_logp_difference/max": 0.8417965173721313, + "sampling/sampling_logp_difference/mean": 0.01546657457947731, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 248.46875, + "completions/mean_terminated_length": 248.46875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.19547869265079498, + "epoch": 0.5637254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04159917279096464, + "kl": 0.026621131226420403, + "learning_rate": 9.771521079319363e-07, + "loss": 0.0002, + "num_tokens": 14382752.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9117350578308105, + "sampling/importance_sampling_ratio/mean": 1.0008268356323242, + "sampling/importance_sampling_ratio/min": 0.5671007037162781, + "sampling/sampling_logp_difference/max": 0.6480112075805664, + "sampling/sampling_logp_difference/mean": 0.013971008360385895, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 210.953125, + "completions/mean_terminated_length": 210.953125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.21355372667312622, + "epoch": 0.5649509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.262666879156973, + "kl": 0.027984999120235443, + "learning_rate": 9.76938738526899e-07, + "loss": -0.0045, + "num_tokens": 14417517.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994992017745972, + "sampling/importance_sampling_ratio/min": 0.11738044768571854, + "sampling/sampling_logp_difference/max": 2.1423349380493164, + "sampling/sampling_logp_difference/mean": 0.015855856239795685, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 245.5, + "completions/mean_terminated_length": 245.5, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.11451134085655212, + "epoch": 0.5661764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030398948615140087, + "kl": 0.010230224579572678, + "learning_rate": 9.767244009664376e-07, + "loss": 0.0001, + "num_tokens": 14455885.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7448595762252808, + "sampling/importance_sampling_ratio/mean": 0.9992393255233765, + "sampling/importance_sampling_ratio/min": 0.41925525665283203, + "sampling/sampling_logp_difference/max": 0.8692753314971924, + "sampling/sampling_logp_difference/mean": 0.010393896140158176, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 214.96875, + "completions/mean_terminated_length": 214.96875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.19482392072677612, + "epoch": 0.5674019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049231483585913345, + "kl": 0.027302060276269913, + "learning_rate": 9.765090956856435e-07, + "loss": 0.0002, + "num_tokens": 14488043.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5804944038391113, + "sampling/importance_sampling_ratio/mean": 0.9997097253799438, + "sampling/importance_sampling_ratio/min": 0.18332743644714355, + "sampling/sampling_logp_difference/max": 1.696481466293335, + "sampling/sampling_logp_difference/mean": 0.013040466234087944, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 218.9375, + "completions/mean_terminated_length": 218.9375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.20897147059440613, + "epoch": 0.5686274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5998888353151224, + "kl": 0.03444429486989975, + "learning_rate": 9.76292823121573e-07, + "loss": -0.0919, + "num_tokens": 14522503.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000460147857666, + "sampling/importance_sampling_ratio/min": 0.4952988624572754, + "sampling/sampling_logp_difference/max": 0.7552809715270996, + "sampling/sampling_logp_difference/mean": 0.01724259927868843, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 242.953125, + "completions/mean_terminated_length": 242.953125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.20893335342407227, + "epoch": 0.5698529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4165889380237286, + "kl": 0.02970290556550026, + "learning_rate": 9.760755837132457e-07, + "loss": 0.1361, + "num_tokens": 14563652.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001609325408936, + "sampling/importance_sampling_ratio/min": 0.4954153895378113, + "sampling/sampling_logp_difference/max": 1.0007104873657227, + "sampling/sampling_logp_difference/mean": 0.014235898852348328, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 144.1875, + "completions/mean_terminated_length": 144.1875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.1707545965909958, + "epoch": 0.571078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08934035938595794, + "kl": 0.026488985866308212, + "learning_rate": 9.758573779016436e-07, + "loss": 0.0003, + "num_tokens": 14584912.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007481575012207, + "sampling/importance_sampling_ratio/min": 0.2991707921028137, + "sampling/sampling_logp_difference/max": 1.2067406177520752, + "sampling/sampling_logp_difference/mean": 0.014445105567574501, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 257.671875, + "completions/mean_terminated_length": 257.671875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.17051297426223755, + "epoch": 0.5723039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1525910990314554, + "kl": 0.021965259686112404, + "learning_rate": 9.75638206129711e-07, + "loss": 0.0003, + "num_tokens": 14616699.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.697697401046753, + "sampling/importance_sampling_ratio/mean": 0.9999450445175171, + "sampling/importance_sampling_ratio/min": 0.4663052260875702, + "sampling/sampling_logp_difference/max": 0.7629148960113525, + "sampling/sampling_logp_difference/mean": 0.013579754158854485, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 213.0, + "completions/mean_terminated_length": 213.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.2851870059967041, + "epoch": 0.5735294117647058, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1066582547543717, + "kl": 0.053356803953647614, + "learning_rate": 9.754180688423524e-07, + "loss": -0.0102, + "num_tokens": 14649371.0, + "reward": 0.28125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001518726348877, + "sampling/importance_sampling_ratio/min": 0.46914398670196533, + "sampling/sampling_logp_difference/max": 0.7568455934524536, + "sampling/sampling_logp_difference/mean": 0.018635809421539307, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 216.890625, + "completions/mean_terminated_length": 216.890625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.24624094367027283, + "epoch": 0.5747549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3750226279227016, + "kl": 0.030825216323137283, + "learning_rate": 9.751969664864326e-07, + "loss": -0.0707, + "num_tokens": 14682580.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6819995641708374, + "sampling/importance_sampling_ratio/mean": 0.9998189210891724, + "sampling/importance_sampling_ratio/min": 0.2631002366542816, + "sampling/sampling_logp_difference/max": 1.335220217704773, + "sampling/sampling_logp_difference/mean": 0.017620351165533066, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 245.515625, + "completions/mean_terminated_length": 245.515625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.1737528145313263, + "epoch": 0.5759803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03394921308531916, + "kl": 0.017159651964902878, + "learning_rate": 9.749748995107756e-07, + "loss": 0.0002, + "num_tokens": 14715765.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999651312828064, + "sampling/importance_sampling_ratio/min": 0.4947980046272278, + "sampling/sampling_logp_difference/max": 0.751502513885498, + "sampling/sampling_logp_difference/mean": 0.014153973199427128, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 195.171875, + "completions/mean_terminated_length": 195.171875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.22031019628047943, + "epoch": 0.5772058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.060635684604694, + "kl": 0.03163802623748779, + "learning_rate": 9.74751868366163e-07, + "loss": 0.0385, + "num_tokens": 14746240.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5807092189788818, + "sampling/importance_sampling_ratio/mean": 0.9998636841773987, + "sampling/importance_sampling_ratio/min": 0.5622367858886719, + "sampling/sampling_logp_difference/max": 0.575832188129425, + "sampling/sampling_logp_difference/mean": 0.01457374356687069, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 262.109375, + "completions/mean_terminated_length": 262.109375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.26105907559394836, + "epoch": 0.5784313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1793484778611238, + "kl": 0.025164538994431496, + "learning_rate": 9.745278735053343e-07, + "loss": 0.0405, + "num_tokens": 14791287.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.969687581062317, + "sampling/importance_sampling_ratio/mean": 1.0003187656402588, + "sampling/importance_sampling_ratio/min": 0.19187341630458832, + "sampling/sampling_logp_difference/max": 1.6509194374084473, + "sampling/sampling_logp_difference/mean": 0.01665969006717205, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 285.921875, + "completions/mean_terminated_length": 285.921875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2519185543060303, + "epoch": 0.5796568627450981, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9432117433202565, + "kl": 0.04478658735752106, + "learning_rate": 9.743029153829845e-07, + "loss": -0.035, + "num_tokens": 14829010.0, + "reward": 0.5625, + "reward_std": 0.6707825064659119, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.9441853761672974, + "sampling/importance_sampling_ratio/mean": 1.0006245374679565, + "sampling/importance_sampling_ratio/min": 0.536237895488739, + "sampling/sampling_logp_difference/max": 0.6648430824279785, + "sampling/sampling_logp_difference/mean": 0.01648460328578949, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 169.5, + "completions/mean_terminated_length": 169.5, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.22762653231620789, + "epoch": 0.5808823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4965402464777053, + "kl": 0.030943382531404495, + "learning_rate": 9.740769944557644e-07, + "loss": -0.0013, + "num_tokens": 14863714.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8166251182556152, + "sampling/importance_sampling_ratio/mean": 1.0000070333480835, + "sampling/importance_sampling_ratio/min": 0.4871021509170532, + "sampling/sampling_logp_difference/max": 0.7192814350128174, + "sampling/sampling_logp_difference/mean": 0.016907792538404465, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 156.59375, + "completions/mean_terminated_length": 156.59375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.24233081936836243, + "epoch": 0.5821078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.636056679673925, + "kl": 0.04024767875671387, + "learning_rate": 9.738501111822792e-07, + "loss": -0.0159, + "num_tokens": 14896408.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000391960144043, + "sampling/importance_sampling_ratio/min": 0.3063385486602783, + "sampling/sampling_logp_difference/max": 1.311995267868042, + "sampling/sampling_logp_difference/mean": 0.018402356654405594, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 147.84375, + "completions/mean_terminated_length": 147.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.13406597077846527, + "epoch": 0.5833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10571691731999149, + "kl": 0.020086705684661865, + "learning_rate": 9.736222660230878e-07, + "loss": 0.0002, + "num_tokens": 14927758.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8623803853988647, + "sampling/importance_sampling_ratio/mean": 1.0003776550292969, + "sampling/importance_sampling_ratio/min": 0.3642270565032959, + "sampling/sampling_logp_difference/max": 1.0099778175354004, + "sampling/sampling_logp_difference/mean": 0.013062086887657642, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 212.078125, + "completions/mean_terminated_length": 212.078125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2687431275844574, + "epoch": 0.5845588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04975967264875057, + "kl": 0.02250150591135025, + "learning_rate": 9.73393459440701e-07, + "loss": 0.0002, + "num_tokens": 14961219.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7017096281051636, + "sampling/importance_sampling_ratio/mean": 0.9993106126785278, + "sampling/importance_sampling_ratio/min": 0.410661906003952, + "sampling/sampling_logp_difference/max": 0.8899850845336914, + "sampling/sampling_logp_difference/mean": 0.01792370155453682, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 5000.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 360.171875, + "completions/mean_terminated_length": 286.5238342285156, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2159585952758789, + "epoch": 0.5857843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0133469177612149, + "kl": 0.016694650053977966, + "learning_rate": 9.73163691899582e-07, + "loss": 0.0339, + "num_tokens": 15004382.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003795623779297, + "sampling/importance_sampling_ratio/min": 0.4347582757472992, + "sampling/sampling_logp_difference/max": 0.8329651355743408, + "sampling/sampling_logp_difference/mean": 0.013661239296197891, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 128.109375, + "completions/mean_terminated_length": 128.109375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.1839389204978943, + "epoch": 0.5870098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07097632624473711, + "kl": 0.03193610906600952, + "learning_rate": 9.729329638661444e-07, + "loss": 0.0003, + "num_tokens": 15030645.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9905723333358765, + "sampling/importance_sampling_ratio/mean": 0.9994369149208069, + "sampling/importance_sampling_ratio/min": 0.48416680097579956, + "sampling/sampling_logp_difference/max": 0.7253258228302002, + "sampling/sampling_logp_difference/mean": 0.01541130430996418, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 291.890625, + "completions/mean_terminated_length": 291.890625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.1650594174861908, + "epoch": 0.5882352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04103054333466548, + "kl": 0.017028018832206726, + "learning_rate": 9.727012758087512e-07, + "loss": 0.0002, + "num_tokens": 15068974.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998735189437866, + "sampling/importance_sampling_ratio/min": 0.47690320014953613, + "sampling/sampling_logp_difference/max": 0.7404417991638184, + "sampling/sampling_logp_difference/mean": 0.012647148221731186, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 228.65625, + "completions/mean_terminated_length": 228.65625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2747301459312439, + "epoch": 0.5894607843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8292346976018947, + "kl": 0.03598939999938011, + "learning_rate": 9.724686281977146e-07, + "loss": 0.0381, + "num_tokens": 15103848.0, + "reward": -0.03125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8844454288482666, + "sampling/importance_sampling_ratio/mean": 1.000560998916626, + "sampling/importance_sampling_ratio/min": 0.272928386926651, + "sampling/sampling_logp_difference/max": 1.2985458374023438, + "sampling/sampling_logp_difference/mean": 0.01755647361278534, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 174.578125, + "completions/mean_terminated_length": 174.578125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.2639964520931244, + "epoch": 0.5906862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6421195753360995, + "kl": 0.039081934839487076, + "learning_rate": 9.722350215052946e-07, + "loss": -0.0048, + "num_tokens": 15137453.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.9253288507461548, + "sampling/importance_sampling_ratio/mean": 1.0001544952392578, + "sampling/importance_sampling_ratio/min": 0.4358985722064972, + "sampling/sampling_logp_difference/max": 0.8303457498550415, + "sampling/sampling_logp_difference/mean": 0.019417976960539818, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 198.59375, + "completions/mean_terminated_length": 198.59375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.16699634492397308, + "epoch": 0.5919117647058824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04647039796785461, + "kl": 0.022983193397521973, + "learning_rate": 9.720004562056979e-07, + "loss": 0.0002, + "num_tokens": 15168051.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.793419361114502, + "sampling/importance_sampling_ratio/mean": 1.0000264644622803, + "sampling/importance_sampling_ratio/min": 0.5038398504257202, + "sampling/sampling_logp_difference/max": 0.6854968070983887, + "sampling/sampling_logp_difference/mean": 0.012654967606067657, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 182.703125, + "completions/mean_terminated_length": 182.703125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.24964210391044617, + "epoch": 0.5931372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4106894254929927, + "kl": 0.048736389726400375, + "learning_rate": 9.717649327750773e-07, + "loss": -0.0088, + "num_tokens": 15198944.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993613958358765, + "sampling/importance_sampling_ratio/min": 0.4776141345500946, + "sampling/sampling_logp_difference/max": 1.0320630073547363, + "sampling/sampling_logp_difference/mean": 0.018597744405269623, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 204.65625, + "completions/mean_terminated_length": 204.65625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.18694047629833221, + "epoch": 0.5943627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05743230416692876, + "kl": 0.027357913553714752, + "learning_rate": 9.7152845169153e-07, + "loss": 0.0003, + "num_tokens": 15229242.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9612669944763184, + "sampling/importance_sampling_ratio/mean": 1.000275731086731, + "sampling/importance_sampling_ratio/min": 0.44152477383613586, + "sampling/sampling_logp_difference/max": 0.8175210952758789, + "sampling/sampling_logp_difference/mean": 0.01288670301437378, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 279.234375, + "completions/mean_terminated_length": 279.234375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.26772433519363403, + "epoch": 0.5955882352941176, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.518300448825345, + "kl": 0.021690839901566505, + "learning_rate": 9.712910134350984e-07, + "loss": -0.0305, + "num_tokens": 15264617.0, + "reward": 0.1875, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998605251312256, + "sampling/importance_sampling_ratio/min": 0.4105510711669922, + "sampling/sampling_logp_difference/max": 1.3690159320831299, + "sampling/sampling_logp_difference/mean": 0.017053838819265366, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 193.078125, + "completions/mean_terminated_length": 193.078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.20557519793510437, + "epoch": 0.5968137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4254188091002689, + "kl": 0.029017888009548187, + "learning_rate": 9.710526184877666e-07, + "loss": -0.0031, + "num_tokens": 15291246.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.8111883401870728, + "sampling/importance_sampling_ratio/mean": 1.000805377960205, + "sampling/importance_sampling_ratio/min": 0.5910916924476624, + "sampling/sampling_logp_difference/max": 0.5939831733703613, + "sampling/sampling_logp_difference/mean": 0.013951467350125313, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 150.71875, + "completions/mean_terminated_length": 150.71875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2271089106798172, + "epoch": 0.5980392156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.188118279180718, + "kl": 0.03827241063117981, + "learning_rate": 9.708132673334615e-07, + "loss": 0.0009, + "num_tokens": 15317116.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997679591178894, + "sampling/importance_sampling_ratio/min": 0.22370855510234833, + "sampling/sampling_logp_difference/max": 1.4974112510681152, + "sampling/sampling_logp_difference/mean": 0.016535578295588493, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.333241730928421, + "epoch": 0.5992647058823529, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.288477586033347, + "kl": 0.030107948929071426, + "learning_rate": 9.705729604580505e-07, + "loss": 0.0076, + "num_tokens": 15351376.0, + "reward": 0.53125, + "reward_std": 0.5809217691421509, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6156491041183472, + "sampling/importance_sampling_ratio/mean": 1.0002347230911255, + "sampling/importance_sampling_ratio/min": 0.4024190306663513, + "sampling/sampling_logp_difference/max": 0.9102613925933838, + "sampling/sampling_logp_difference/mean": 0.018553704023361206, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 173.578125, + "completions/mean_terminated_length": 173.578125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.20034442842006683, + "epoch": 0.6004901960784313, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.652665815530287, + "kl": 0.04240548610687256, + "learning_rate": 9.703316983493412e-07, + "loss": 0.0079, + "num_tokens": 15379189.0, + "reward": 0.0625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004112720489502, + "sampling/importance_sampling_ratio/min": 0.5910095572471619, + "sampling/sampling_logp_difference/max": 0.7670514583587646, + "sampling/sampling_logp_difference/mean": 0.014264474622905254, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 311.359375, + "completions/mean_terminated_length": 311.359375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.24044373631477356, + "epoch": 0.6017156862745098, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1011466606746954, + "kl": 0.04351915419101715, + "learning_rate": 9.700894814970808e-07, + "loss": 0.0258, + "num_tokens": 15413612.0, + "reward": 0.25, + "reward_std": 0.8767043352127075, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.8453627824783325, + "sampling/importance_sampling_ratio/mean": 0.9997578859329224, + "sampling/importance_sampling_ratio/min": 0.6080636978149414, + "sampling/sampling_logp_difference/max": 0.6126759052276611, + "sampling/sampling_logp_difference/mean": 0.014367097988724709, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 191.953125, + "completions/mean_terminated_length": 191.953125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.28681284189224243, + "epoch": 0.6029411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8833523639178975, + "kl": 0.04870586842298508, + "learning_rate": 9.698463103929541e-07, + "loss": 0.0007, + "num_tokens": 15442297.0, + "reward": 0.34375, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5971494913101196, + "sampling/importance_sampling_ratio/mean": 0.9996565580368042, + "sampling/importance_sampling_ratio/min": 0.4826450049877167, + "sampling/sampling_logp_difference/max": 0.7284739017486572, + "sampling/sampling_logp_difference/mean": 0.01625833474099636, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 187.0625, + "completions/mean_terminated_length": 187.0625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.20023608207702637, + "epoch": 0.6041666666666666, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.173857085033842, + "kl": 0.03565539792180061, + "learning_rate": 9.69602185530583e-07, + "loss": -0.0005, + "num_tokens": 15472461.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0011024475097656, + "sampling/importance_sampling_ratio/min": 0.18803824484348297, + "sampling/sampling_logp_difference/max": 1.671109914779663, + "sampling/sampling_logp_difference/mean": 0.01602831482887268, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 199.921875, + "completions/mean_terminated_length": 199.921875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2912241816520691, + "epoch": 0.6053921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1771468516592747, + "kl": 0.04060687497258186, + "learning_rate": 9.693571074055254e-07, + "loss": 0.0522, + "num_tokens": 15501400.0, + "reward": 0.25, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995718002319336, + "sampling/importance_sampling_ratio/min": 0.5355345606803894, + "sampling/sampling_logp_difference/max": 1.2372488975524902, + "sampling/sampling_logp_difference/mean": 0.01795007474720478, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 241.625, + "completions/mean_terminated_length": 241.625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.2918547987937927, + "epoch": 0.6066176470588235, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1390454655609363, + "kl": 0.04890880733728409, + "learning_rate": 9.691110765152744e-07, + "loss": -0.0128, + "num_tokens": 15534832.0, + "reward": -0.53125, + "reward_std": 0.7129635810852051, + "rewards/decision_reward_func/mean": -0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995063543319702, + "sampling/importance_sampling_ratio/min": 0.39389124512672424, + "sampling/sampling_logp_difference/max": 0.93168044090271, + "sampling/sampling_logp_difference/mean": 0.0170971117913723, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 202.90625, + "completions/mean_terminated_length": 202.90625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.22431150078773499, + "epoch": 0.6078431372549019, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1644071437761385, + "kl": 0.04543592780828476, + "learning_rate": 9.688640933592572e-07, + "loss": 0.0014, + "num_tokens": 15563114.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.8400806188583374, + "sampling/importance_sampling_ratio/mean": 0.9998714327812195, + "sampling/importance_sampling_ratio/min": 0.5355109572410583, + "sampling/sampling_logp_difference/max": 0.6245338916778564, + "sampling/sampling_logp_difference/mean": 0.013310464099049568, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 247.390625, + "completions/mean_terminated_length": 247.390625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.32082051038742065, + "epoch": 0.6090686274509803, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7337860353603984, + "kl": 0.038668520748615265, + "learning_rate": 9.686161584388339e-07, + "loss": -0.0442, + "num_tokens": 15595955.0, + "reward": 0.1875, + "reward_std": 0.7473389506340027, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.8616005182266235, + "sampling/importance_sampling_ratio/mean": 1.001033067703247, + "sampling/importance_sampling_ratio/min": 0.4488525390625, + "sampling/sampling_logp_difference/max": 0.8010609149932861, + "sampling/sampling_logp_difference/mean": 0.020086873322725296, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 198.28125, + "completions/mean_terminated_length": 198.28125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.36790406703948975, + "epoch": 0.6102941176470589, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.712605465705619, + "kl": 0.052883580327034, + "learning_rate": 9.683672722572966e-07, + "loss": -0.0124, + "num_tokens": 15625461.0, + "reward": 0.25, + "reward_std": 0.8881268501281738, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995077848434448, + "sampling/importance_sampling_ratio/min": 0.2736213505268097, + "sampling/sampling_logp_difference/max": 1.2960100173950195, + "sampling/sampling_logp_difference/mean": 0.021305786445736885, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 296.984375, + "completions/mean_terminated_length": 296.984375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2703689634799957, + "epoch": 0.6115196078431373, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.637297883464752, + "kl": 0.024464547634124756, + "learning_rate": 9.681174353198686e-07, + "loss": -0.0025, + "num_tokens": 15662500.0, + "reward": 0.75, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.9148050546646118, + "sampling/importance_sampling_ratio/mean": 0.9993337392807007, + "sampling/importance_sampling_ratio/min": 0.4634568393230438, + "sampling/sampling_logp_difference/max": 0.7690420150756836, + "sampling/sampling_logp_difference/mean": 0.015952816233038902, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 187.9375, + "completions/mean_terminated_length": 187.9375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.25232523679733276, + "epoch": 0.6127450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2686866057794326, + "kl": 0.03192438930273056, + "learning_rate": 9.678666481337031e-07, + "loss": -0.079, + "num_tokens": 15692992.0, + "reward": 0.0625, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6614391803741455, + "sampling/importance_sampling_ratio/mean": 1.0009832382202148, + "sampling/importance_sampling_ratio/min": 0.4818120300769806, + "sampling/sampling_logp_difference/max": 0.730201244354248, + "sampling/sampling_logp_difference/mean": 0.016551656648516655, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 245.5625, + "completions/mean_terminated_length": 245.5625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.25726181268692017, + "epoch": 0.6139705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.560473235310796, + "kl": 0.053573839366436005, + "learning_rate": 9.67614911207882e-07, + "loss": -0.0075, + "num_tokens": 15725348.0, + "reward": -0.1875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992358088493347, + "sampling/importance_sampling_ratio/min": 0.4940890669822693, + "sampling/sampling_logp_difference/max": 0.7050395011901855, + "sampling/sampling_logp_difference/mean": 0.014738351106643677, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 285.0625, + "completions/mean_terminated_length": 285.0625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.21006496250629425, + "epoch": 0.6151960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7539990683844793, + "kl": 0.02914992719888687, + "learning_rate": 9.673622250534155e-07, + "loss": -0.0089, + "num_tokens": 15763160.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5782129764556885, + "sampling/importance_sampling_ratio/mean": 0.9996501207351685, + "sampling/importance_sampling_ratio/min": 0.4119178354740143, + "sampling/sampling_logp_difference/max": 0.8869314193725586, + "sampling/sampling_logp_difference/mean": 0.01188575103878975, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 144.296875, + "completions/mean_terminated_length": 144.296875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.16736435890197754, + "epoch": 0.616421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6543420321245603, + "kl": 0.04313846677541733, + "learning_rate": 9.671085901832404e-07, + "loss": 0.048, + "num_tokens": 15785435.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9100173711776733, + "sampling/importance_sampling_ratio/mean": 0.9994516968727112, + "sampling/importance_sampling_ratio/min": 0.04118496552109718, + "sampling/sampling_logp_difference/max": 3.1896820068359375, + "sampling/sampling_logp_difference/mean": 0.013791139237582684, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 191.046875, + "completions/mean_terminated_length": 191.046875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.22429190576076508, + "epoch": 0.6176470588235294, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.4627974240636274, + "kl": 0.045783478766679764, + "learning_rate": 9.668540071122195e-07, + "loss": -0.0016, + "num_tokens": 15813630.0, + "reward": 0.21875, + "reward_std": 0.6505630612373352, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.7294622659683228, + "sampling/importance_sampling_ratio/mean": 0.999798595905304, + "sampling/importance_sampling_ratio/min": 0.37196803092956543, + "sampling/sampling_logp_difference/max": 0.9889473915100098, + "sampling/sampling_logp_difference/mean": 0.015147536993026733, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 217.34375, + "completions/mean_terminated_length": 217.34375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.27516812086105347, + "epoch": 0.6188725490196079, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5642094762042984, + "kl": 0.06203371286392212, + "learning_rate": 9.665984763571402e-07, + "loss": -0.0135, + "num_tokens": 15845908.0, + "reward": 0.84375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5175708532333374, + "sampling/importance_sampling_ratio/mean": 0.9990229606628418, + "sampling/importance_sampling_ratio/min": 0.41066715121269226, + "sampling/sampling_logp_difference/max": 0.8899722099304199, + "sampling/sampling_logp_difference/mean": 0.01645125448703766, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 194.25, + "completions/mean_terminated_length": 194.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.23022127151489258, + "epoch": 0.6200980392156863, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.063309763162759, + "kl": 0.04602504521608353, + "learning_rate": 9.663419984367137e-07, + "loss": -0.0305, + "num_tokens": 15877268.0, + "reward": 0.0625, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.817983865737915, + "sampling/importance_sampling_ratio/mean": 1.0002416372299194, + "sampling/importance_sampling_ratio/min": 0.6171835660934448, + "sampling/sampling_logp_difference/max": 0.5977281332015991, + "sampling/sampling_logp_difference/mean": 0.015211190097033978, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 226.5, + "completions/mean_terminated_length": 226.5, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2860516309738159, + "epoch": 0.6213235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3938382172467045, + "kl": 0.05346071720123291, + "learning_rate": 9.660845738715742e-07, + "loss": 0.0177, + "num_tokens": 15907652.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 0.9996599555015564, + "sampling/importance_sampling_ratio/min": 0.5685405731201172, + "sampling/sampling_logp_difference/max": 0.5646826028823853, + "sampling/sampling_logp_difference/mean": 0.01554498914629221, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 174.3125, + "completions/mean_terminated_length": 174.3125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.33097076416015625, + "epoch": 0.6225490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.13984277246622, + "kl": 0.0651451051235199, + "learning_rate": 9.658262031842769e-07, + "loss": -0.0326, + "num_tokens": 15937208.0, + "reward": -0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.7676414251327515, + "sampling/importance_sampling_ratio/mean": 1.0007094144821167, + "sampling/importance_sampling_ratio/min": 0.5665751695632935, + "sampling/sampling_logp_difference/max": 0.5696461200714111, + "sampling/sampling_logp_difference/mean": 0.018425047397613525, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 187.65625, + "completions/mean_terminated_length": 187.65625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.24647732079029083, + "epoch": 0.6237745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.771429223174817, + "kl": 0.056372880935668945, + "learning_rate": 9.655668868992983e-07, + "loss": -0.0004, + "num_tokens": 15968802.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991722106933594, + "sampling/importance_sampling_ratio/min": 0.09368312358856201, + "sampling/sampling_logp_difference/max": 2.3678371906280518, + "sampling/sampling_logp_difference/mean": 0.016202397644519806, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 152.09375, + "completions/mean_terminated_length": 152.09375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.21541383862495422, + "epoch": 0.625, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.9867445811566165, + "kl": 0.05038762837648392, + "learning_rate": 9.653066255430338e-07, + "loss": 0.0541, + "num_tokens": 15994024.0, + "reward": 0.46875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999164342880249, + "sampling/importance_sampling_ratio/min": 0.34088990092277527, + "sampling/sampling_logp_difference/max": 1.0761957168579102, + "sampling/sampling_logp_difference/mean": 0.01494535617530346, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 235.359375, + "completions/mean_terminated_length": 235.359375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3029787242412567, + "epoch": 0.6262254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4808825125445177, + "kl": 0.041609782725572586, + "learning_rate": 9.650454196437973e-07, + "loss": -0.0542, + "num_tokens": 16024591.0, + "reward": -0.03125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6658228635787964, + "sampling/importance_sampling_ratio/mean": 0.9998339414596558, + "sampling/importance_sampling_ratio/min": 0.5049895644187927, + "sampling/sampling_logp_difference/max": 0.6832175254821777, + "sampling/sampling_logp_difference/mean": 0.015195919200778008, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 198.53125, + "completions/mean_terminated_length": 198.53125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.24742618203163147, + "epoch": 0.6274509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4148743369693726, + "kl": 0.04469574987888336, + "learning_rate": 9.647832697318206e-07, + "loss": -0.0317, + "num_tokens": 16057089.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003695487976074, + "sampling/importance_sampling_ratio/min": 0.23465335369110107, + "sampling/sampling_logp_difference/max": 1.44964599609375, + "sampling/sampling_logp_difference/mean": 0.015340734273195267, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 254.765625, + "completions/mean_terminated_length": 254.765625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2867315411567688, + "epoch": 0.6286764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5370886558265742, + "kl": 0.05973536893725395, + "learning_rate": 9.645201763392513e-07, + "loss": 0.0284, + "num_tokens": 16091170.0, + "reward": 0.53125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996262788772583, + "sampling/importance_sampling_ratio/min": 0.4404844045639038, + "sampling/sampling_logp_difference/max": 0.8198802471160889, + "sampling/sampling_logp_difference/mean": 0.015395834110677242, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 154.671875, + "completions/mean_terminated_length": 154.671875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.16976070404052734, + "epoch": 0.6299019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.388522938175457, + "kl": 0.05982288718223572, + "learning_rate": 9.64256140000152e-07, + "loss": -0.0058, + "num_tokens": 16119709.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006251335144043, + "sampling/importance_sampling_ratio/min": 0.514415442943573, + "sampling/sampling_logp_difference/max": 0.7173492908477783, + "sampling/sampling_logp_difference/mean": 0.013128705322742462, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 198.609375, + "completions/mean_terminated_length": 198.609375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.21194711327552795, + "epoch": 0.6311274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1475040650361064, + "kl": 0.05573735758662224, + "learning_rate": 9.639911612505003e-07, + "loss": 0.0451, + "num_tokens": 16153252.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5551807880401611, + "sampling/importance_sampling_ratio/mean": 1.0002050399780273, + "sampling/importance_sampling_ratio/min": 0.3899836838245392, + "sampling/sampling_logp_difference/max": 0.941650390625, + "sampling/sampling_logp_difference/mean": 0.014565443620085716, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 181.421875, + "completions/mean_terminated_length": 181.421875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.20764178037643433, + "epoch": 0.6323529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1720468058929345, + "kl": 0.05153400078415871, + "learning_rate": 9.63725240628186e-07, + "loss": -0.0403, + "num_tokens": 16182751.0, + "reward": -0.09375, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004583597183228, + "sampling/importance_sampling_ratio/min": 0.6154090762138367, + "sampling/sampling_logp_difference/max": 0.8144304752349854, + "sampling/sampling_logp_difference/mean": 0.012461901642382145, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 176.109375, + "completions/mean_terminated_length": 176.109375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.27043965458869934, + "epoch": 0.633578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9558631710433498, + "kl": 0.07393071800470352, + "learning_rate": 9.634583786730108e-07, + "loss": 0.0777, + "num_tokens": 16212310.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6234568357467651, + "sampling/importance_sampling_ratio/mean": 1.0003788471221924, + "sampling/importance_sampling_ratio/min": 0.4622943103313446, + "sampling/sampling_logp_difference/max": 0.7715535163879395, + "sampling/sampling_logp_difference/mean": 0.01648622192442417, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 245.734375, + "completions/mean_terminated_length": 245.734375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.18301594257354736, + "epoch": 0.6348039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03779742217316824, + "kl": 0.0384780615568161, + "learning_rate": 9.63190575926688e-07, + "loss": 0.0003, + "num_tokens": 16246037.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5963561534881592, + "sampling/importance_sampling_ratio/mean": 0.9991670250892639, + "sampling/importance_sampling_ratio/min": 0.5856403708457947, + "sampling/sampling_logp_difference/max": 0.5350494384765625, + "sampling/sampling_logp_difference/mean": 0.012074870988726616, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 210.0625, + "completions/mean_terminated_length": 210.0625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.3609851002693176, + "epoch": 0.6360294117647058, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6217027238359476, + "kl": 0.06879754364490509, + "learning_rate": 9.6292183293284e-07, + "loss": -0.0125, + "num_tokens": 16279049.0, + "reward": 0.0, + "reward_std": 0.6143567562103271, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5075724124908447, + "sampling/importance_sampling_ratio/mean": 1.000047206878662, + "sampling/importance_sampling_ratio/min": 0.5625096559524536, + "sampling/sampling_logp_difference/max": 0.5753469467163086, + "sampling/sampling_logp_difference/mean": 0.017157625406980515, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 298.671875, + "completions/mean_terminated_length": 298.671875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.2705211043357849, + "epoch": 0.6372549019607843, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.709078537110251, + "kl": 0.044981539249420166, + "learning_rate": 9.626521502369983e-07, + "loss": 0.0487, + "num_tokens": 16311732.0, + "reward": 0.46875, + "reward_std": 0.5722135901451111, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.818015694618225, + "sampling/importance_sampling_ratio/mean": 0.999798595905304, + "sampling/importance_sampling_ratio/min": 0.12197814881801605, + "sampling/sampling_logp_difference/max": 2.1039133071899414, + "sampling/sampling_logp_difference/mean": 0.013444855809211731, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 224.296875, + "completions/mean_terminated_length": 224.296875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.23086273670196533, + "epoch": 0.6384803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.002731924335962, + "kl": 0.05612014979124069, + "learning_rate": 9.623815283866015e-07, + "loss": -0.018, + "num_tokens": 16342279.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7553222179412842, + "sampling/importance_sampling_ratio/mean": 0.9998987317085266, + "sampling/importance_sampling_ratio/min": 0.510368287563324, + "sampling/sampling_logp_difference/max": 0.6726226806640625, + "sampling/sampling_logp_difference/mean": 0.013258611783385277, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 148.21875, + "completions/mean_terminated_length": 148.21875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.24009597301483154, + "epoch": 0.6397058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.865036848073618, + "kl": 0.08404193073511124, + "learning_rate": 9.621099679309946e-07, + "loss": 0.0149, + "num_tokens": 16368309.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000610589981079, + "sampling/importance_sampling_ratio/min": 0.591404139995575, + "sampling/sampling_logp_difference/max": 0.7522697448730469, + "sampling/sampling_logp_difference/mean": 0.015585711225867271, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 177.28125, + "completions/mean_terminated_length": 177.28125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.22186435759067535, + "epoch": 0.6409313725490197, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.80332290150908, + "kl": 0.08081275224685669, + "learning_rate": 9.618374694214285e-07, + "loss": 0.0786, + "num_tokens": 16393751.0, + "reward": 0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5947540998458862, + "sampling/importance_sampling_ratio/mean": 1.0002617835998535, + "sampling/importance_sampling_ratio/min": 0.6109259128570557, + "sampling/sampling_logp_difference/max": 0.49277955293655396, + "sampling/sampling_logp_difference/mean": 0.013700183480978012, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 191.1875, + "completions/mean_terminated_length": 191.1875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.35577911138534546, + "epoch": 0.6421568627450981, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4853327131233844, + "kl": 0.10113876312971115, + "learning_rate": 9.615640334110578e-07, + "loss": -0.0734, + "num_tokens": 16428339.0, + "reward": 0.09375, + "reward_std": 0.7931214570999146, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000748872756958, + "sampling/importance_sampling_ratio/min": 0.5403655171394348, + "sampling/sampling_logp_difference/max": 0.842801570892334, + "sampling/sampling_logp_difference/mean": 0.018896231427788734, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 147.40625, + "completions/mean_terminated_length": 147.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.2719324231147766, + "epoch": 0.6433823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6063992458995404, + "kl": 0.0841350331902504, + "learning_rate": 9.612896604549401e-07, + "loss": -0.0053, + "num_tokens": 16452637.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.826122760772705, + "sampling/importance_sampling_ratio/mean": 1.000278115272522, + "sampling/importance_sampling_ratio/min": 0.6413664221763611, + "sampling/sampling_logp_difference/max": 0.6021950244903564, + "sampling/sampling_logp_difference/mean": 0.01662953570485115, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 169.484375, + "completions/mean_terminated_length": 169.484375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2781379222869873, + "epoch": 0.6446078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1459635014793854, + "kl": 0.0846191793680191, + "learning_rate": 9.610143511100354e-07, + "loss": 0.0113, + "num_tokens": 16478364.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.65408456325531, + "sampling/importance_sampling_ratio/mean": 1.0004284381866455, + "sampling/importance_sampling_ratio/min": 0.5941311717033386, + "sampling/sampling_logp_difference/max": 0.5206551551818848, + "sampling/sampling_logp_difference/mean": 0.015035307966172695, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 189.625, + "completions/mean_terminated_length": 189.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.27831029891967773, + "epoch": 0.6458333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.161012926026988, + "kl": 0.10464677959680557, + "learning_rate": 9.607381059352038e-07, + "loss": 0.0051, + "num_tokens": 16510692.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5662132501602173, + "sampling/importance_sampling_ratio/mean": 0.9999711513519287, + "sampling/importance_sampling_ratio/min": 0.5038020610809326, + "sampling/sampling_logp_difference/max": 0.6855719089508057, + "sampling/sampling_logp_difference/mean": 0.016203096136450768, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 198.09375, + "completions/mean_terminated_length": 198.09375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.2400476336479187, + "epoch": 0.6470588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2877264244056412, + "kl": 0.07220980525016785, + "learning_rate": 9.60460925491206e-07, + "loss": -0.0051, + "num_tokens": 16543786.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6094046831130981, + "sampling/importance_sampling_ratio/mean": 0.9998757839202881, + "sampling/importance_sampling_ratio/min": 0.5140842199325562, + "sampling/sampling_logp_difference/max": 0.6653681993484497, + "sampling/sampling_logp_difference/mean": 0.014955186285078526, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 167.28125, + "completions/mean_terminated_length": 167.28125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2575138807296753, + "epoch": 0.6482843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3237644518946134, + "kl": 0.07371678948402405, + "learning_rate": 9.601828103407004e-07, + "loss": -0.008, + "num_tokens": 16575372.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997285008430481, + "sampling/importance_sampling_ratio/min": 0.5245344042778015, + "sampling/sampling_logp_difference/max": 0.8339135646820068, + "sampling/sampling_logp_difference/mean": 0.01670641452074051, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 183.578125, + "completions/mean_terminated_length": 183.578125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.23362132906913757, + "epoch": 0.6495098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06795336830070159, + "kl": 0.07781791687011719, + "learning_rate": 9.599037610482433e-07, + "loss": 0.0007, + "num_tokens": 16604929.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000827312469482, + "sampling/importance_sampling_ratio/min": 0.4381314516067505, + "sampling/sampling_logp_difference/max": 0.8252363204956055, + "sampling/sampling_logp_difference/mean": 0.014331628568470478, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 193.4375, + "completions/mean_terminated_length": 193.4375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2269354909658432, + "epoch": 0.6507352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05150817261310415, + "kl": 0.05640610307455063, + "learning_rate": 9.59623778180287e-07, + "loss": 0.0006, + "num_tokens": 16635693.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002391338348389, + "sampling/importance_sampling_ratio/min": 0.4477214515209198, + "sampling/sampling_logp_difference/max": 0.8460354804992676, + "sampling/sampling_logp_difference/mean": 0.014656702056527138, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 212.6875, + "completions/mean_terminated_length": 212.6875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.29390576481819153, + "epoch": 0.6519607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9920082670028044, + "kl": 0.1033441424369812, + "learning_rate": 9.593428623051791e-07, + "loss": 0.0022, + "num_tokens": 16667257.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5775432586669922, + "sampling/importance_sampling_ratio/mean": 1.0003143548965454, + "sampling/importance_sampling_ratio/min": 0.29912951588630676, + "sampling/sampling_logp_difference/max": 1.206878662109375, + "sampling/sampling_logp_difference/mean": 0.016023050993680954, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 180.5, + "completions/mean_terminated_length": 180.5, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2245170921087265, + "epoch": 0.6531862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06037153225127418, + "kl": 0.06282234191894531, + "learning_rate": 9.59061013993161e-07, + "loss": 0.0006, + "num_tokens": 16700473.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5690274238586426, + "sampling/importance_sampling_ratio/mean": 0.999611496925354, + "sampling/importance_sampling_ratio/min": 0.5601468682289124, + "sampling/sampling_logp_difference/max": 0.5795562267303467, + "sampling/sampling_logp_difference/mean": 0.014398027211427689, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 245.921875, + "completions/mean_terminated_length": 245.921875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.2711595892906189, + "epoch": 0.6544117647058824, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9371063223721268, + "kl": 0.08010272681713104, + "learning_rate": 9.587782338163667e-07, + "loss": 0.0662, + "num_tokens": 16735188.0, + "reward": 0.8125, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.862383484840393, + "sampling/importance_sampling_ratio/mean": 1.000108242034912, + "sampling/importance_sampling_ratio/min": 0.2419363260269165, + "sampling/sampling_logp_difference/max": 1.4190807342529297, + "sampling/sampling_logp_difference/mean": 0.015247123315930367, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 214.09375, + "completions/mean_terminated_length": 214.09375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.29227620363235474, + "epoch": 0.6556372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05571439573830488, + "kl": 0.091652050614357, + "learning_rate": 9.584945223488226e-07, + "loss": 0.0009, + "num_tokens": 16769834.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6192150115966797, + "sampling/importance_sampling_ratio/mean": 1.0003212690353394, + "sampling/importance_sampling_ratio/min": 0.564578652381897, + "sampling/sampling_logp_difference/max": 0.5716755390167236, + "sampling/sampling_logp_difference/mean": 0.015373600646853447, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 183.953125, + "completions/mean_terminated_length": 183.953125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.24862109124660492, + "epoch": 0.6568627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05687396399793213, + "kl": 0.09466812759637833, + "learning_rate": 9.582098801664443e-07, + "loss": 0.0009, + "num_tokens": 16801031.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.500941514968872, + "sampling/importance_sampling_ratio/mean": 0.9996380805969238, + "sampling/importance_sampling_ratio/min": 0.6125088334083557, + "sampling/sampling_logp_difference/max": 0.4901919364929199, + "sampling/sampling_logp_difference/mean": 0.0137968510389328, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 259.3125, + "completions/mean_terminated_length": 259.3125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.2820940315723419, + "epoch": 0.6580882352941176, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1695247246060607, + "kl": 0.06301745027303696, + "learning_rate": 9.579243078470378e-07, + "loss": -0.0069, + "num_tokens": 16838539.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6039164066314697, + "sampling/importance_sampling_ratio/mean": 0.9995141625404358, + "sampling/importance_sampling_ratio/min": 0.16875283420085907, + "sampling/sampling_logp_difference/max": 1.779320240020752, + "sampling/sampling_logp_difference/mean": 0.014724900014698505, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 219.390625, + "completions/mean_terminated_length": 219.390625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.39771369099617004, + "epoch": 0.6593137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.642792632565206, + "kl": 0.12054871767759323, + "learning_rate": 9.576378059702968e-07, + "loss": 0.0329, + "num_tokens": 16873700.0, + "reward": 0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5629504919052124, + "sampling/importance_sampling_ratio/mean": 0.9997038841247559, + "sampling/importance_sampling_ratio/min": 0.5489786863327026, + "sampling/sampling_logp_difference/max": 0.5996956825256348, + "sampling/sampling_logp_difference/mean": 0.019365711137652397, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 214.25, + "completions/mean_terminated_length": 214.25, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2835337817668915, + "epoch": 0.6605392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10563928191472281, + "kl": 0.1101784035563469, + "learning_rate": 9.573503751178018e-07, + "loss": 0.001, + "num_tokens": 16906708.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8463019132614136, + "sampling/importance_sampling_ratio/mean": 0.9996192455291748, + "sampling/importance_sampling_ratio/min": 0.5425854921340942, + "sampling/sampling_logp_difference/max": 0.6131846904754639, + "sampling/sampling_logp_difference/mean": 0.016335483640432358, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 199.859375, + "completions/mean_terminated_length": 199.859375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.37571844458580017, + "epoch": 0.6617647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2575798644483294, + "kl": 0.10863693803548813, + "learning_rate": 9.570620158730194e-07, + "loss": 0.0054, + "num_tokens": 16946395.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7116420269012451, + "sampling/importance_sampling_ratio/mean": 0.9990810751914978, + "sampling/importance_sampling_ratio/min": 0.4442260265350342, + "sampling/sampling_logp_difference/max": 0.8114218711853027, + "sampling/sampling_logp_difference/mean": 0.0193776898086071, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 180.265625, + "completions/mean_terminated_length": 180.265625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.22068551182746887, + "epoch": 0.6629901960784313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06815042504597542, + "kl": 0.08907188475131989, + "learning_rate": 9.567727288213004e-07, + "loss": 0.0008, + "num_tokens": 16975404.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6308956146240234, + "sampling/importance_sampling_ratio/mean": 1.000535011291504, + "sampling/importance_sampling_ratio/min": 0.6361169815063477, + "sampling/sampling_logp_difference/max": 0.48912930488586426, + "sampling/sampling_logp_difference/mean": 0.013500725850462914, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 178.359375, + "completions/mean_terminated_length": 178.359375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2912602722644806, + "epoch": 0.6642156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8030424189206746, + "kl": 0.136108860373497, + "learning_rate": 9.564825145498793e-07, + "loss": -0.0412, + "num_tokens": 17004595.0, + "reward": 0.15625, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.872259497642517, + "sampling/importance_sampling_ratio/mean": 0.9996621608734131, + "sampling/importance_sampling_ratio/min": 0.294066458940506, + "sampling/sampling_logp_difference/max": 1.2239494323730469, + "sampling/sampling_logp_difference/mean": 0.016193510964512825, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 164.578125, + "completions/mean_terminated_length": 164.578125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.31142550706863403, + "epoch": 0.6654411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1293985363764585, + "kl": 0.1263895183801651, + "learning_rate": 9.561913736478728e-07, + "loss": 0.0597, + "num_tokens": 17035528.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996033310890198, + "sampling/importance_sampling_ratio/min": 0.4838782846927643, + "sampling/sampling_logp_difference/max": 0.8160734176635742, + "sampling/sampling_logp_difference/mean": 0.0175727941095829, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 191.0625, + "completions/mean_terminated_length": 191.0625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.25655582547187805, + "epoch": 0.6666666666666666, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4997438922921271, + "kl": 0.07254540920257568, + "learning_rate": 9.558993067062784e-07, + "loss": 0.0149, + "num_tokens": 17064764.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6207282543182373, + "sampling/importance_sampling_ratio/mean": 1.0000991821289062, + "sampling/importance_sampling_ratio/min": 0.4860598146915436, + "sampling/sampling_logp_difference/max": 0.7214236259460449, + "sampling/sampling_logp_difference/mean": 0.015043018385767937, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 247.46875, + "completions/mean_terminated_length": 247.46875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.29953157901763916, + "epoch": 0.6678921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040827915306439845, + "kl": 0.06826378405094147, + "learning_rate": 9.556063143179735e-07, + "loss": 0.0006, + "num_tokens": 17104554.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003316402435303, + "sampling/importance_sampling_ratio/min": 0.2916131317615509, + "sampling/sampling_logp_difference/max": 1.2323272228240967, + "sampling/sampling_logp_difference/mean": 0.01632849872112274, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 179.65625, + "completions/mean_terminated_length": 179.65625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.30334335565567017, + "epoch": 0.6691176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06335981783002397, + "kl": 0.10095895081758499, + "learning_rate": 9.55312397077714e-07, + "loss": 0.001, + "num_tokens": 17132644.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006049871444702, + "sampling/importance_sampling_ratio/min": 0.6039285659790039, + "sampling/sampling_logp_difference/max": 0.7801837921142578, + "sampling/sampling_logp_difference/mean": 0.01764393225312233, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 227.8125, + "completions/mean_terminated_length": 227.8125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2977250814437866, + "epoch": 0.6703431372549019, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.503040894183228, + "kl": 0.08063647150993347, + "learning_rate": 9.550175555821334e-07, + "loss": 0.1021, + "num_tokens": 17166632.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.738738775253296, + "sampling/importance_sampling_ratio/mean": 1.000267505645752, + "sampling/importance_sampling_ratio/min": 0.5096349120140076, + "sampling/sampling_logp_difference/max": 0.674060583114624, + "sampling/sampling_logp_difference/mean": 0.015589935705065727, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 161.9375, + "completions/mean_terminated_length": 161.9375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.33991408348083496, + "epoch": 0.6715686274509803, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2215866275205087, + "kl": 0.14120438694953918, + "learning_rate": 9.547217904297409e-07, + "loss": -0.049, + "num_tokens": 17194900.0, + "reward": 0.71875, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000576972961426, + "sampling/importance_sampling_ratio/min": 0.35791149735450745, + "sampling/sampling_logp_difference/max": 1.0274696350097656, + "sampling/sampling_logp_difference/mean": 0.01935707777738571, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 175.328125, + "completions/mean_terminated_length": 175.328125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3658367693424225, + "epoch": 0.6727941176470589, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0852632848722275, + "kl": 0.11114779859781265, + "learning_rate": 9.544251022209216e-07, + "loss": 0.0552, + "num_tokens": 17229657.0, + "reward": 0.6875, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996533989906311, + "sampling/importance_sampling_ratio/min": 0.37608960270881653, + "sampling/sampling_logp_difference/max": 0.9779279232025146, + "sampling/sampling_logp_difference/mean": 0.020111430436372757, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 168.359375, + "completions/mean_terminated_length": 168.359375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.22088299691677094, + "epoch": 0.6740196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05858628617393321, + "kl": 0.06984078884124756, + "learning_rate": 9.541274915579334e-07, + "loss": 0.0007, + "num_tokens": 17255808.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999707818031311, + "sampling/importance_sampling_ratio/min": 0.36338913440704346, + "sampling/sampling_logp_difference/max": 1.012281060218811, + "sampling/sampling_logp_difference/mean": 0.013919494114816189, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 199.28125, + "completions/mean_terminated_length": 199.28125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.28910544514656067, + "epoch": 0.6752450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.517878511161227, + "kl": 0.13955840468406677, + "learning_rate": 9.538289590449071e-07, + "loss": 0.0013, + "num_tokens": 17288018.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7042194604873657, + "sampling/importance_sampling_ratio/mean": 0.9997285008430481, + "sampling/importance_sampling_ratio/min": 0.4405342936515808, + "sampling/sampling_logp_difference/max": 0.8197669982910156, + "sampling/sampling_logp_difference/mean": 0.015938732773065567, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 213.21875, + "completions/mean_terminated_length": 213.21875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.29339146614074707, + "epoch": 0.6764705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048436218632843996, + "kl": 0.0689179003238678, + "learning_rate": 9.535295052878449e-07, + "loss": 0.0007, + "num_tokens": 17321552.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6922236680984497, + "sampling/importance_sampling_ratio/mean": 1.0000426769256592, + "sampling/importance_sampling_ratio/min": 0.5164052844047546, + "sampling/sampling_logp_difference/max": 0.6608633995056152, + "sampling/sampling_logp_difference/mean": 0.015585601329803467, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 138.5, + "completions/mean_terminated_length": 138.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.21433620154857635, + "epoch": 0.6776960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07873846618363181, + "kl": 0.08914574980735779, + "learning_rate": 9.53229130894619e-07, + "loss": 0.0009, + "num_tokens": 17346672.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8140681982040405, + "sampling/importance_sampling_ratio/mean": 0.999647855758667, + "sampling/importance_sampling_ratio/min": 0.48965176939964294, + "sampling/sampling_logp_difference/max": 0.7140607833862305, + "sampling/sampling_logp_difference/mean": 0.015001345425844193, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 162.78125, + "completions/mean_terminated_length": 162.78125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.2795228362083435, + "epoch": 0.678921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10242324308658463, + "kl": 0.08963474631309509, + "learning_rate": 9.529278364749702e-07, + "loss": 0.0008, + "num_tokens": 17377138.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5608521699905396, + "sampling/importance_sampling_ratio/mean": 1.0004630088806152, + "sampling/importance_sampling_ratio/min": 0.4826413094997406, + "sampling/sampling_logp_difference/max": 0.7284815311431885, + "sampling/sampling_logp_difference/mean": 0.015321934595704079, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 185.859375, + "completions/mean_terminated_length": 185.859375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.32190006971359253, + "epoch": 0.6801470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.380334760732346, + "kl": 0.08094222843647003, + "learning_rate": 9.526256226405073e-07, + "loss": 0.0372, + "num_tokens": 17405545.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.39084792137146, + "sampling/importance_sampling_ratio/mean": 0.9996212720870972, + "sampling/importance_sampling_ratio/min": 0.578363299369812, + "sampling/sampling_logp_difference/max": 0.5475530624389648, + "sampling/sampling_logp_difference/mean": 0.015896037220954895, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 184.546875, + "completions/mean_terminated_length": 184.546875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.27940815687179565, + "epoch": 0.6813725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.246729190634135, + "kl": 0.08324264734983444, + "learning_rate": 9.523224900047051e-07, + "loss": 0.0589, + "num_tokens": 17431420.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.655898928642273, + "sampling/importance_sampling_ratio/mean": 1.0001144409179688, + "sampling/importance_sampling_ratio/min": 0.48278486728668213, + "sampling/sampling_logp_difference/max": 0.728184163570404, + "sampling/sampling_logp_difference/mean": 0.015265392139554024, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 171.71875, + "completions/mean_terminated_length": 171.71875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2746974527835846, + "epoch": 0.6825980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0743775941752993, + "kl": 0.18216821551322937, + "learning_rate": 9.520184391829036e-07, + "loss": 0.0021, + "num_tokens": 17463418.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6304324865341187, + "sampling/importance_sampling_ratio/mean": 0.9998118877410889, + "sampling/importance_sampling_ratio/min": 0.5492480993270874, + "sampling/sampling_logp_difference/max": 0.5992050170898438, + "sampling/sampling_logp_difference/mean": 0.015748387202620506, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 180.515625, + "completions/mean_terminated_length": 180.515625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.32589060068130493, + "epoch": 0.6838235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6425778279453724, + "kl": 0.0704319179058075, + "learning_rate": 9.517134707923069e-07, + "loss": -0.0216, + "num_tokens": 17495099.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.7607169151306152, + "sampling/importance_sampling_ratio/mean": 1.000322937965393, + "sampling/importance_sampling_ratio/min": 0.29270172119140625, + "sampling/sampling_logp_difference/max": 1.228601098060608, + "sampling/sampling_logp_difference/mean": 0.017600977793335915, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 181.765625, + "completions/mean_terminated_length": 181.765625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.23727193474769592, + "epoch": 0.6850490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05431430416722844, + "kl": 0.07662207633256912, + "learning_rate": 9.514075854519813e-07, + "loss": 0.0007, + "num_tokens": 17523196.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.715319275856018, + "sampling/importance_sampling_ratio/mean": 0.9998581409454346, + "sampling/importance_sampling_ratio/min": 0.5144228339195251, + "sampling/sampling_logp_difference/max": 0.6647096872329712, + "sampling/sampling_logp_difference/mean": 0.01397109217941761, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 166.921875, + "completions/mean_terminated_length": 166.921875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3472491502761841, + "epoch": 0.6862745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.457981257648886, + "kl": 0.10838024318218231, + "learning_rate": 9.511007837828548e-07, + "loss": -0.0213, + "num_tokens": 17556007.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.957856297492981, + "sampling/importance_sampling_ratio/mean": 1.0007283687591553, + "sampling/importance_sampling_ratio/min": 0.5365152359008789, + "sampling/sampling_logp_difference/max": 0.6718502044677734, + "sampling/sampling_logp_difference/mean": 0.017687536776065826, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 252.296875, + "completions/mean_terminated_length": 252.296875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.3433667719364166, + "epoch": 0.6875, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1470517825611433, + "kl": 0.0592065192759037, + "learning_rate": 9.507930664077153e-07, + "loss": -0.0069, + "num_tokens": 17595114.0, + "reward": -0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9486160278320312, + "sampling/importance_sampling_ratio/mean": 1.0003085136413574, + "sampling/importance_sampling_ratio/min": 0.5001212358474731, + "sampling/sampling_logp_difference/max": 0.6929047107696533, + "sampling/sampling_logp_difference/mean": 0.01738228276371956, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 183.921875, + "completions/mean_terminated_length": 183.921875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2436867654323578, + "epoch": 0.6887254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05303062558787955, + "kl": 0.07596056163311005, + "learning_rate": 9.504844339512094e-07, + "loss": 0.0007, + "num_tokens": 17626421.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009114742279053, + "sampling/importance_sampling_ratio/min": 0.29601848125457764, + "sampling/sampling_logp_difference/max": 1.3902349472045898, + "sampling/sampling_logp_difference/mean": 0.01490679569542408, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 172.3125, + "completions/mean_terminated_length": 172.3125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.32237541675567627, + "epoch": 0.6899509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.237576665563323, + "kl": 0.10190116614103317, + "learning_rate": 9.501748870398419e-07, + "loss": 0.0389, + "num_tokens": 17654809.0, + "reward": -0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.689343810081482, + "sampling/importance_sampling_ratio/mean": 0.9994897246360779, + "sampling/importance_sampling_ratio/min": 0.482289582490921, + "sampling/sampling_logp_difference/max": 0.729210615158081, + "sampling/sampling_logp_difference/mean": 0.017055770382285118, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 189.640625, + "completions/mean_terminated_length": 189.640625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.1565149873495102, + "epoch": 0.6911764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4900637771554077, + "kl": 0.059271518141031265, + "learning_rate": 9.498644263019731e-07, + "loss": 0.0301, + "num_tokens": 17686738.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5005675554275513, + "sampling/importance_sampling_ratio/mean": 0.9995347261428833, + "sampling/importance_sampling_ratio/min": 0.1834573894739151, + "sampling/sampling_logp_difference/max": 1.6957728862762451, + "sampling/sampling_logp_difference/mean": 0.011621063575148582, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 176.0625, + "completions/mean_terminated_length": 176.0625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4577755033969879, + "epoch": 0.6924019607843137, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.612417444906285, + "kl": 0.18480560183525085, + "learning_rate": 9.495530523678186e-07, + "loss": -0.0051, + "num_tokens": 17717606.0, + "reward": 0.0625, + "reward_std": 0.9955304861068726, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7520650625228882, + "sampling/importance_sampling_ratio/mean": 1.000302791595459, + "sampling/importance_sampling_ratio/min": 0.6144919395446777, + "sampling/sampling_logp_difference/max": 0.5607950687408447, + "sampling/sampling_logp_difference/mean": 0.02176954783499241, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 188.953125, + "completions/mean_terminated_length": 188.953125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2748880982398987, + "epoch": 0.6936274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5379158429291662, + "kl": 0.08798833191394806, + "learning_rate": 9.492407658694477e-07, + "loss": 0.0107, + "num_tokens": 17745587.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000697374343872, + "sampling/importance_sampling_ratio/min": 0.6073539853096008, + "sampling/sampling_logp_difference/max": 1.2174501419067383, + "sampling/sampling_logp_difference/mean": 0.016703777015209198, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 170.4375, + "completions/mean_terminated_length": 170.4375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2802906632423401, + "epoch": 0.6948529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1508284482199413, + "kl": 0.09377346932888031, + "learning_rate": 9.489275674407825e-07, + "loss": -0.015, + "num_tokens": 17773279.0, + "reward": 0.78125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.595808982849121, + "sampling/importance_sampling_ratio/mean": 0.9997985363006592, + "sampling/importance_sampling_ratio/min": 0.5319787859916687, + "sampling/sampling_logp_difference/max": 0.6311516761779785, + "sampling/sampling_logp_difference/mean": 0.015627920627593994, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 142.046875, + "completions/mean_terminated_length": 142.046875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.20774143934249878, + "epoch": 0.696078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06676551202190348, + "kl": 0.06828676909208298, + "learning_rate": 9.486134577175957e-07, + "loss": 0.0006, + "num_tokens": 17796754.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5373115539550781, + "sampling/importance_sampling_ratio/mean": 1.0002179145812988, + "sampling/importance_sampling_ratio/min": 0.6622359156608582, + "sampling/sampling_logp_difference/max": 0.4300351142883301, + "sampling/sampling_logp_difference/mean": 0.013358856551349163, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 198.96875, + "completions/mean_terminated_length": 198.96875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3348020017147064, + "epoch": 0.6973039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6026369656150075, + "kl": 0.08831897377967834, + "learning_rate": 9.482984373375104e-07, + "loss": 0.036, + "num_tokens": 17829024.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999412298202515, + "sampling/importance_sampling_ratio/min": 0.600390613079071, + "sampling/sampling_logp_difference/max": 0.930025577545166, + "sampling/sampling_logp_difference/mean": 0.018753811717033386, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3299071788787842, + "epoch": 0.6985294117647058, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1039569568245162, + "kl": 0.12124665826559067, + "learning_rate": 9.479825069399977e-07, + "loss": 0.0055, + "num_tokens": 17858568.0, + "reward": 0.03125, + "reward_std": 0.5143726468086243, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9641410112380981, + "sampling/importance_sampling_ratio/mean": 1.00050950050354, + "sampling/importance_sampling_ratio/min": 0.290988564491272, + "sampling/sampling_logp_difference/max": 1.234471321105957, + "sampling/sampling_logp_difference/mean": 0.01924917846918106, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 171.0625, + "completions/mean_terminated_length": 171.0625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.28250452876091003, + "epoch": 0.6997549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7806336214041554, + "kl": 0.09873203933238983, + "learning_rate": 9.476656671663766e-07, + "loss": -0.0054, + "num_tokens": 17891580.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.8723371028900146, + "sampling/importance_sampling_ratio/mean": 0.9993788599967957, + "sampling/importance_sampling_ratio/min": 0.6087372899055481, + "sampling/sampling_logp_difference/max": 0.6271874904632568, + "sampling/sampling_logp_difference/mean": 0.01713189110159874, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 182.96875, + "completions/mean_terminated_length": 182.96875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2556510269641876, + "epoch": 0.7009803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5571421400333714, + "kl": 0.07203780114650726, + "learning_rate": 9.473479186598114e-07, + "loss": -0.0093, + "num_tokens": 17919642.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000793218612671, + "sampling/importance_sampling_ratio/min": 0.5747721791267395, + "sampling/sampling_logp_difference/max": 0.9725565910339355, + "sampling/sampling_logp_difference/mean": 0.017283430323004723, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 195.015625, + "completions/mean_terminated_length": 195.015625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.22532916069030762, + "epoch": 0.7022058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.068594738552317, + "kl": 0.06998701393604279, + "learning_rate": 9.470292620653119e-07, + "loss": -0.0038, + "num_tokens": 17949115.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8179765939712524, + "sampling/importance_sampling_ratio/mean": 1.0004429817199707, + "sampling/importance_sampling_ratio/min": 0.4138515293598175, + "sampling/sampling_logp_difference/max": 0.882248044013977, + "sampling/sampling_logp_difference/mean": 0.013591472990810871, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 214.078125, + "completions/mean_terminated_length": 214.078125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.37085291743278503, + "epoch": 0.7034313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3661480693610237, + "kl": 0.10367009043693542, + "learning_rate": 9.467096980297304e-07, + "loss": 0.0438, + "num_tokens": 17980880.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009095668792725, + "sampling/importance_sampling_ratio/min": 0.32664066553115845, + "sampling/sampling_logp_difference/max": 1.1188945770263672, + "sampling/sampling_logp_difference/mean": 0.01959863491356373, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.37925583124160767, + "epoch": 0.7046568627450981, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0154665755784262, + "kl": 0.13310685753822327, + "learning_rate": 9.463892272017618e-07, + "loss": 0.0451, + "num_tokens": 18011768.0, + "reward": 0.5625, + "reward_std": 0.784286618232727, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999893307685852, + "sampling/importance_sampling_ratio/min": 0.4033031761646271, + "sampling/sampling_logp_difference/max": 0.9080667495727539, + "sampling/sampling_logp_difference/mean": 0.01956643909215927, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 176.171875, + "completions/mean_terminated_length": 176.171875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.27574622631073, + "epoch": 0.7058823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8423241448140895, + "kl": 0.07914084196090698, + "learning_rate": 9.460678502319416e-07, + "loss": 0.0201, + "num_tokens": 18038899.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000700831413269, + "sampling/importance_sampling_ratio/min": 0.27442094683647156, + "sampling/sampling_logp_difference/max": 1.5977290868759155, + "sampling/sampling_logp_difference/mean": 0.01649722456932068, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 236.28125, + "completions/mean_terminated_length": 236.28125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.33270812034606934, + "epoch": 0.7071078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.830751486801874, + "kl": 0.08072858303785324, + "learning_rate": 9.457455677726447e-07, + "loss": -0.0964, + "num_tokens": 18075173.0, + "reward": -0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.654782772064209, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.5717520713806152, + "sampling/sampling_logp_difference/max": 0.5590498447418213, + "sampling/sampling_logp_difference/mean": 0.01706099882721901, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 209.65625, + "completions/mean_terminated_length": 209.65625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2930196225643158, + "epoch": 0.7083333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2614592752394247, + "kl": 0.07866720855236053, + "learning_rate": 9.454223804780841e-07, + "loss": -0.0029, + "num_tokens": 18106719.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.815672755241394, + "sampling/importance_sampling_ratio/mean": 1.0005232095718384, + "sampling/importance_sampling_ratio/min": 0.07906536757946014, + "sampling/sampling_logp_difference/max": 2.537480354309082, + "sampling/sampling_logp_difference/mean": 0.017369206994771957, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 191.25, + "completions/mean_terminated_length": 191.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3268337547779083, + "epoch": 0.7095588235294118, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.4297343599552055, + "kl": 0.0850410908460617, + "learning_rate": 9.450982890043094e-07, + "loss": 0.0429, + "num_tokens": 18139391.0, + "reward": -0.28125, + "reward_std": 0.565913200378418, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000206708908081, + "sampling/importance_sampling_ratio/min": 0.1372629851102829, + "sampling/sampling_logp_difference/max": 1.985856533050537, + "sampling/sampling_logp_difference/mean": 0.01785438321530819, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 248.6875, + "completions/mean_terminated_length": 248.6875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.30297189950942993, + "epoch": 0.7107843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.019871671926416, + "kl": 0.08185930550098419, + "learning_rate": 9.447732940092059e-07, + "loss": -0.0279, + "num_tokens": 18176011.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8411680459976196, + "sampling/importance_sampling_ratio/mean": 0.9997085332870483, + "sampling/importance_sampling_ratio/min": 0.23401731252670288, + "sampling/sampling_logp_difference/max": 1.4523601531982422, + "sampling/sampling_logp_difference/mean": 0.017601585015654564, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 213.140625, + "completions/mean_terminated_length": 213.140625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2970901131629944, + "epoch": 0.7120098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5351385312250692, + "kl": 0.07222087681293488, + "learning_rate": 9.444473961524927e-07, + "loss": -0.0141, + "num_tokens": 18218756.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6027271747589111, + "sampling/importance_sampling_ratio/mean": 0.9998239874839783, + "sampling/importance_sampling_ratio/min": 0.47394949197769165, + "sampling/sampling_logp_difference/max": 0.7466545104980469, + "sampling/sampling_logp_difference/mean": 0.018158020451664925, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 218.140625, + "completions/mean_terminated_length": 218.140625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2991441488265991, + "epoch": 0.7132352941176471, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1461756462892447, + "kl": 0.0762210264801979, + "learning_rate": 9.441205960957219e-07, + "loss": 0.0535, + "num_tokens": 18252589.0, + "reward": 0.125, + "reward_std": 0.6047805547714233, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005971193313599, + "sampling/importance_sampling_ratio/min": 0.41775795817375183, + "sampling/sampling_logp_difference/max": 0.8728530406951904, + "sampling/sampling_logp_difference/mean": 0.017642326653003693, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 165.265625, + "completions/mean_terminated_length": 165.265625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2318461537361145, + "epoch": 0.7144607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7609467454165173, + "kl": 0.06704086065292358, + "learning_rate": 9.43792894502277e-07, + "loss": -0.019, + "num_tokens": 18279614.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5931954383850098, + "sampling/importance_sampling_ratio/mean": 0.999550461769104, + "sampling/importance_sampling_ratio/min": 0.35211485624313354, + "sampling/sampling_logp_difference/max": 1.0437978506088257, + "sampling/sampling_logp_difference/mean": 0.01575883850455284, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 193.921875, + "completions/mean_terminated_length": 193.921875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.33422935009002686, + "epoch": 0.7156862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4824358414655476, + "kl": 0.11342482268810272, + "learning_rate": 9.434642920373713e-07, + "loss": -0.0443, + "num_tokens": 18313945.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004544258117676, + "sampling/importance_sampling_ratio/min": 0.3287467062473297, + "sampling/sampling_logp_difference/max": 1.1124677658081055, + "sampling/sampling_logp_difference/mean": 0.022340642288327217, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 186.609375, + "completions/mean_terminated_length": 186.609375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2566809058189392, + "epoch": 0.7169117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.651125667788851, + "kl": 0.17529572546482086, + "learning_rate": 9.431347893680472e-07, + "loss": -0.105, + "num_tokens": 18340672.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994093179702759, + "sampling/importance_sampling_ratio/min": 0.1798427850008011, + "sampling/sampling_logp_difference/max": 1.715672254562378, + "sampling/sampling_logp_difference/mean": 0.018325788900256157, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 199.890625, + "completions/mean_terminated_length": 199.890625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.16830769181251526, + "epoch": 0.7181372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4054503752299314, + "kl": 0.04657375067472458, + "learning_rate": 9.428043871631739e-07, + "loss": 0.0256, + "num_tokens": 18369193.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998905658721924, + "sampling/importance_sampling_ratio/min": 0.3609904646873474, + "sampling/sampling_logp_difference/max": 1.0189037322998047, + "sampling/sampling_logp_difference/mean": 0.011816881597042084, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 198.078125, + "completions/mean_terminated_length": 198.078125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3127084970474243, + "epoch": 0.7193627450980392, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.3133259272413325, + "kl": 0.07372134178876877, + "learning_rate": 9.424730860934472e-07, + "loss": -0.0365, + "num_tokens": 18405102.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7537305355072021, + "sampling/importance_sampling_ratio/mean": 1.000502586364746, + "sampling/importance_sampling_ratio/min": 0.47892704606056213, + "sampling/sampling_logp_difference/max": 0.7362070083618164, + "sampling/sampling_logp_difference/mean": 0.01948605105280876, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 161.96875, + "completions/mean_terminated_length": 161.96875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2641754150390625, + "epoch": 0.7205882352941176, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.012368980612821, + "kl": 0.12216442823410034, + "learning_rate": 9.421408868313873e-07, + "loss": -0.0231, + "num_tokens": 18429532.0, + "reward": 0.0625, + "reward_std": 0.5765564441680908, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7469195127487183, + "sampling/importance_sampling_ratio/mean": 1.0002282857894897, + "sampling/importance_sampling_ratio/min": 0.5910896062850952, + "sampling/sampling_logp_difference/max": 0.5578539371490479, + "sampling/sampling_logp_difference/mean": 0.016571495682001114, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 192.0, + "completions/mean_terminated_length": 192.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.201697438955307, + "epoch": 0.7218137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2828015128304315, + "kl": 0.06914472579956055, + "learning_rate": 9.418077900513376e-07, + "loss": 0.0625, + "num_tokens": 18458604.0, + "reward": 0.46875, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5680956840515137, + "sampling/importance_sampling_ratio/mean": 1.0003582239151, + "sampling/importance_sampling_ratio/min": 0.5520696043968201, + "sampling/sampling_logp_difference/max": 0.5940811634063721, + "sampling/sampling_logp_difference/mean": 0.014866025187075138, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 181.921875, + "completions/mean_terminated_length": 181.921875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2416316270828247, + "epoch": 0.7230392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4748538219743468, + "kl": 0.05072364956140518, + "learning_rate": 9.414737964294634e-07, + "loss": -0.0024, + "num_tokens": 18491047.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9876954555511475, + "sampling/importance_sampling_ratio/mean": 1.0002965927124023, + "sampling/importance_sampling_ratio/min": 0.5677002668380737, + "sampling/sampling_logp_difference/max": 0.6869759559631348, + "sampling/sampling_logp_difference/mean": 0.016352690756320953, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 207.78125, + "completions/mean_terminated_length": 207.78125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.2534104585647583, + "epoch": 0.7242647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7327769190709892, + "kl": 0.1017884761095047, + "learning_rate": 9.411389066437507e-07, + "loss": -0.0358, + "num_tokens": 18525577.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.7781813144683838, + "sampling/importance_sampling_ratio/mean": 1.0002974271774292, + "sampling/importance_sampling_ratio/min": 0.46167972683906555, + "sampling/sampling_logp_difference/max": 0.7728838920593262, + "sampling/sampling_logp_difference/mean": 0.018235966563224792, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 175.75, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.24245025217533112, + "epoch": 0.7254901960784313, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.0873627688625596, + "kl": 0.09672357141971588, + "learning_rate": 9.408031213740044e-07, + "loss": -0.1126, + "num_tokens": 18553737.0, + "reward": 0.1875, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.8540359735488892, + "sampling/importance_sampling_ratio/mean": 1.000643253326416, + "sampling/importance_sampling_ratio/min": 0.5420742630958557, + "sampling/sampling_logp_difference/max": 0.6173648834228516, + "sampling/sampling_logp_difference/mean": 0.01740797609090805, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 187.9375, + "completions/mean_terminated_length": 187.9375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2418367564678192, + "epoch": 0.7267156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1264170023692373, + "kl": 0.0872611477971077, + "learning_rate": 9.404664413018476e-07, + "loss": 0.0105, + "num_tokens": 18587685.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.817976713180542, + "sampling/importance_sampling_ratio/mean": 0.9995722770690918, + "sampling/importance_sampling_ratio/min": 0.3772679567337036, + "sampling/sampling_logp_difference/max": 0.974799633026123, + "sampling/sampling_logp_difference/mean": 0.01760914921760559, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 277.140625, + "completions/mean_terminated_length": 277.140625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.25757867097854614, + "epoch": 0.7279411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.101525224955409, + "kl": 0.047139450907707214, + "learning_rate": 9.401288671107193e-07, + "loss": -0.036, + "num_tokens": 18625790.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995555281639099, + "sampling/importance_sampling_ratio/min": 0.1641063690185547, + "sampling/sampling_logp_difference/max": 1.8072404861450195, + "sampling/sampling_logp_difference/mean": 0.016724945977330208, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 187.765625, + "completions/mean_terminated_length": 187.765625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2661805748939514, + "epoch": 0.7291666666666666, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2565356284367513, + "kl": 0.10957401990890503, + "learning_rate": 9.397903994858735e-07, + "loss": -0.0139, + "num_tokens": 18657599.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8815131187438965, + "sampling/importance_sampling_ratio/mean": 1.000257968902588, + "sampling/importance_sampling_ratio/min": 0.5219682455062866, + "sampling/sampling_logp_difference/max": 0.6501485109329224, + "sampling/sampling_logp_difference/mean": 0.01742507517337799, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 200.90625, + "completions/mean_terminated_length": 200.90625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.22445990145206451, + "epoch": 0.7303921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.823977890106038, + "kl": 0.0618349090218544, + "learning_rate": 9.394510391143786e-07, + "loss": -0.0216, + "num_tokens": 18685913.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001581907272339, + "sampling/importance_sampling_ratio/min": 0.4787578880786896, + "sampling/sampling_logp_difference/max": 0.7520298957824707, + "sampling/sampling_logp_difference/mean": 0.016777168959379196, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 228.0625, + "completions/mean_terminated_length": 228.0625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.1810968518257141, + "epoch": 0.7316176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049935742312966276, + "kl": 0.040459342300891876, + "learning_rate": 9.391107866851142e-07, + "loss": 0.0004, + "num_tokens": 18733485.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7553400993347168, + "sampling/importance_sampling_ratio/mean": 1.0002613067626953, + "sampling/importance_sampling_ratio/min": 0.3586792051792145, + "sampling/sampling_logp_difference/max": 1.0253269672393799, + "sampling/sampling_logp_difference/mean": 0.01264331303536892, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 182.046875, + "completions/mean_terminated_length": 182.046875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3017774820327759, + "epoch": 0.7328431372549019, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.256661925458094, + "kl": 0.10925137996673584, + "learning_rate": 9.387696428887715e-07, + "loss": 0.0262, + "num_tokens": 18759536.0, + "reward": -0.5625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99953293800354, + "sampling/importance_sampling_ratio/min": 0.4693239629268646, + "sampling/sampling_logp_difference/max": 0.7564619779586792, + "sampling/sampling_logp_difference/mean": 0.01892591454088688, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 156.53125, + "completions/mean_terminated_length": 156.53125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.21708914637565613, + "epoch": 0.7340686274509803, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.047905324272125, + "kl": 0.10377472639083862, + "learning_rate": 9.384276084178504e-07, + "loss": 0.045, + "num_tokens": 18783746.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6252808570861816, + "sampling/importance_sampling_ratio/mean": 0.9993909597396851, + "sampling/importance_sampling_ratio/min": 0.37504148483276367, + "sampling/sampling_logp_difference/max": 0.9807186126708984, + "sampling/sampling_logp_difference/mean": 0.016367195174098015, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 210.203125, + "completions/mean_terminated_length": 210.203125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.32918208837509155, + "epoch": 0.7352941176470589, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0687708861778114, + "kl": 0.10704650729894638, + "learning_rate": 9.380846839666595e-07, + "loss": -0.0631, + "num_tokens": 18830031.0, + "reward": 0.15625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.7516393661499023, + "sampling/importance_sampling_ratio/mean": 1.0006492137908936, + "sampling/importance_sampling_ratio/min": 0.5185782313346863, + "sampling/sampling_logp_difference/max": 0.6566643714904785, + "sampling/sampling_logp_difference/mean": 0.01922888122498989, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 185.28125, + "completions/mean_terminated_length": 185.28125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2313532829284668, + "epoch": 0.7365196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8540450253251732, + "kl": 0.11107133328914642, + "learning_rate": 9.377408702313136e-07, + "loss": -0.0061, + "num_tokens": 18857905.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002211332321167, + "sampling/importance_sampling_ratio/min": 0.4707980751991272, + "sampling/sampling_logp_difference/max": 1.8231003284454346, + "sampling/sampling_logp_difference/mean": 0.016610242426395416, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 174.109375, + "completions/mean_terminated_length": 174.109375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.15651580691337585, + "epoch": 0.7377450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1056531568919814, + "kl": 0.07144558429718018, + "learning_rate": 9.37396167909733e-07, + "loss": 0.0007, + "num_tokens": 18887800.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001873970031738, + "sampling/importance_sampling_ratio/min": 0.34135210514068604, + "sampling/sampling_logp_difference/max": 1.074840784072876, + "sampling/sampling_logp_difference/mean": 0.011767935007810593, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 176.703125, + "completions/mean_terminated_length": 176.703125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.232400581240654, + "epoch": 0.7389705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.4091177250320435, + "kl": 0.09642212837934494, + "learning_rate": 9.370505777016413e-07, + "loss": -0.0364, + "num_tokens": 18915429.0, + "reward": 0.21875, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009019374847412, + "sampling/importance_sampling_ratio/min": 0.44574272632598877, + "sampling/sampling_logp_difference/max": 1.4517979621887207, + "sampling/sampling_logp_difference/mean": 0.018499933183193207, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 221.734375, + "completions/mean_terminated_length": 221.734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.22668619453907013, + "epoch": 0.7401960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3208188812787982, + "kl": 0.06442856788635254, + "learning_rate": 9.367041003085648e-07, + "loss": 0.0001, + "num_tokens": 18948612.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5386793613433838, + "sampling/importance_sampling_ratio/mean": 0.9998711347579956, + "sampling/importance_sampling_ratio/min": 0.30428776144981384, + "sampling/sampling_logp_difference/max": 1.1897814273834229, + "sampling/sampling_logp_difference/mean": 0.0150670874863863, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 247.0625, + "completions/mean_terminated_length": 247.0625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.2504423260688782, + "epoch": 0.741421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06447632382614953, + "kl": 0.054915666580200195, + "learning_rate": 9.363567364338307e-07, + "loss": 0.0005, + "num_tokens": 18986248.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994238615036011, + "sampling/importance_sampling_ratio/min": 0.3814408779144287, + "sampling/sampling_logp_difference/max": 0.9637994766235352, + "sampling/sampling_logp_difference/mean": 0.017830682918429375, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 199.828125, + "completions/mean_terminated_length": 199.828125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.20959287881851196, + "epoch": 0.7426470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6888387770785338, + "kl": 0.07850334048271179, + "learning_rate": 9.360084867825658e-07, + "loss": 0.0147, + "num_tokens": 19018877.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003209114074707, + "sampling/importance_sampling_ratio/min": 0.4849974513053894, + "sampling/sampling_logp_difference/max": 1.4400253295898438, + "sampling/sampling_logp_difference/mean": 0.015034351497888565, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 324.0, + "completions/mean_terminated_length": 324.0, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2272258996963501, + "epoch": 0.7438725490196079, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296258183615557, + "kl": 0.053450606763362885, + "learning_rate": 9.356593520616946e-07, + "loss": 0.0356, + "num_tokens": 19068989.0, + "reward": -0.78125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": -0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999680757522583, + "sampling/importance_sampling_ratio/min": 0.397955060005188, + "sampling/sampling_logp_difference/max": 1.0813994407653809, + "sampling/sampling_logp_difference/mean": 0.01458294689655304, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.21361863613128662, + "epoch": 0.7450980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07297670675866863, + "kl": 0.06689755618572235, + "learning_rate": 9.353093329799386e-07, + "loss": 0.0007, + "num_tokens": 19097077.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999480247497559, + "sampling/importance_sampling_ratio/min": 0.5174911022186279, + "sampling/sampling_logp_difference/max": 0.8329944610595703, + "sampling/sampling_logp_difference/mean": 0.013164438307285309, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 231.71875, + "completions/mean_terminated_length": 231.71875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.16736982762813568, + "epoch": 0.7463235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07469319168022485, + "kl": 0.0580403134226799, + "learning_rate": 9.349584302478144e-07, + "loss": 0.0005, + "num_tokens": 19131683.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001057386398315, + "sampling/importance_sampling_ratio/min": 0.5198130011558533, + "sampling/sampling_logp_difference/max": 0.6996743679046631, + "sampling/sampling_logp_difference/mean": 0.012384508736431599, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 231.390625, + "completions/mean_terminated_length": 231.390625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.17477071285247803, + "epoch": 0.7475490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.20948898139056, + "kl": 0.07928837090730667, + "learning_rate": 9.346066445776321e-07, + "loss": 0.0212, + "num_tokens": 19164636.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7640935182571411, + "sampling/importance_sampling_ratio/mean": 0.9995098114013672, + "sampling/importance_sampling_ratio/min": 0.5353386402130127, + "sampling/sampling_logp_difference/max": 0.6248557567596436, + "sampling/sampling_logp_difference/mean": 0.011636776849627495, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.18699999153614044, + "epoch": 0.7487745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.224725062560454, + "kl": 0.07661756873130798, + "learning_rate": 9.342539766834945e-07, + "loss": 0.0342, + "num_tokens": 19198888.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.931645393371582, + "sampling/importance_sampling_ratio/mean": 0.9998631477355957, + "sampling/importance_sampling_ratio/min": 0.3853294253349304, + "sampling/sampling_logp_difference/max": 0.9536566734313965, + "sampling/sampling_logp_difference/mean": 0.013207919895648956, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 232.734375, + "completions/mean_terminated_length": 232.734375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.1946050077676773, + "epoch": 0.75, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8619925595998692, + "kl": 0.08101022243499756, + "learning_rate": 9.339004272812949e-07, + "loss": -0.0113, + "num_tokens": 19233431.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.9876788854599, + "sampling/importance_sampling_ratio/mean": 0.9999079704284668, + "sampling/importance_sampling_ratio/min": 0.14849144220352173, + "sampling/sampling_logp_difference/max": 1.9072279930114746, + "sampling/sampling_logp_difference/mean": 0.015206235460937023, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 240.109375, + "completions/mean_terminated_length": 240.109375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.20316070318222046, + "epoch": 0.7512254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.26616001363097, + "kl": 0.06727971136569977, + "learning_rate": 9.335459970887165e-07, + "loss": 0.0262, + "num_tokens": 19265662.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.620835304260254, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.5038020610809326, + "sampling/sampling_logp_difference/max": 0.6855719089508057, + "sampling/sampling_logp_difference/mean": 0.013958821073174477, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 265.4375, + "completions/mean_terminated_length": 265.4375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2019714117050171, + "epoch": 0.7524509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06056981560619988, + "kl": 0.04912319406867027, + "learning_rate": 9.331906868252299e-07, + "loss": 0.0004, + "num_tokens": 19303882.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6355785131454468, + "sampling/importance_sampling_ratio/mean": 0.9992480874061584, + "sampling/importance_sampling_ratio/min": 0.28295519948005676, + "sampling/sampling_logp_difference/max": 1.2624666690826416, + "sampling/sampling_logp_difference/mean": 0.01610996387898922, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 265.890625, + "completions/mean_terminated_length": 265.890625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.1941600888967514, + "epoch": 0.7536764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07376953339828347, + "kl": 0.053942590951919556, + "learning_rate": 9.328344972120925e-07, + "loss": 0.0005, + "num_tokens": 19341299.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6283336877822876, + "sampling/importance_sampling_ratio/mean": 0.9999680519104004, + "sampling/importance_sampling_ratio/min": 0.267314612865448, + "sampling/sampling_logp_difference/max": 1.319329023361206, + "sampling/sampling_logp_difference/mean": 0.013370128348469734, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 282.1875, + "completions/mean_terminated_length": 282.1875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.2386167347431183, + "epoch": 0.7549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.103684839254737, + "kl": 0.04838567599654198, + "learning_rate": 9.324774289723467e-07, + "loss": 0.0036, + "num_tokens": 19380063.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9849047660827637, + "sampling/importance_sampling_ratio/mean": 0.999728798866272, + "sampling/importance_sampling_ratio/min": 0.5693269968032837, + "sampling/sampling_logp_difference/max": 0.6855709552764893, + "sampling/sampling_logp_difference/mean": 0.013577599078416824, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 256.90625, + "completions/mean_terminated_length": 256.90625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.16752922534942627, + "epoch": 0.7561274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05594665590122851, + "kl": 0.058025140315294266, + "learning_rate": 9.321194828308183e-07, + "loss": 0.0005, + "num_tokens": 19411161.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994553327560425, + "sampling/importance_sampling_ratio/min": 0.31814250349998474, + "sampling/sampling_logp_difference/max": 1.1452558040618896, + "sampling/sampling_logp_difference/mean": 0.01149724330753088, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 224.125, + "completions/mean_terminated_length": 224.125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.22526061534881592, + "epoch": 0.7573529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1830682491617488, + "kl": 0.0643705353140831, + "learning_rate": 9.317606595141155e-07, + "loss": -0.0464, + "num_tokens": 19442737.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997191429138184, + "sampling/importance_sampling_ratio/min": 0.056884463876485825, + "sampling/sampling_logp_difference/max": 2.8667330741882324, + "sampling/sampling_logp_difference/mean": 0.015809137374162674, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 256.1875, + "completions/mean_terminated_length": 256.1875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.22948342561721802, + "epoch": 0.758578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8439502527324416, + "kl": 0.0796620324254036, + "learning_rate": 9.314009597506265e-07, + "loss": 0.1217, + "num_tokens": 19473085.0, + "reward": 0.5, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000457763671875, + "sampling/importance_sampling_ratio/min": 0.3986046314239502, + "sampling/sampling_logp_difference/max": 0.9197852611541748, + "sampling/sampling_logp_difference/mean": 0.014308687299489975, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 257.828125, + "completions/mean_terminated_length": 257.828125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.25681865215301514, + "epoch": 0.7598039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1337875133983972, + "kl": 0.0655963122844696, + "learning_rate": 9.310403842705194e-07, + "loss": -0.016, + "num_tokens": 19505826.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.9782131910324097, + "sampling/importance_sampling_ratio/mean": 1.0000708103179932, + "sampling/importance_sampling_ratio/min": 0.45821645855903625, + "sampling/sampling_logp_difference/max": 0.7804136276245117, + "sampling/sampling_logp_difference/mean": 0.016288483515381813, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 333.453125, + "completions/mean_terminated_length": 333.453125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.27068203687667847, + "epoch": 0.7610294117647058, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3486866936154918, + "kl": 0.06823822855949402, + "learning_rate": 9.306789338057393e-07, + "loss": -0.0288, + "num_tokens": 19548063.0, + "reward": 0.0, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999840497970581, + "sampling/importance_sampling_ratio/min": 0.38916364312171936, + "sampling/sampling_logp_difference/max": 0.9877901077270508, + "sampling/sampling_logp_difference/mean": 0.016273140907287598, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 264.75, + "completions/mean_terminated_length": 264.75, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.18036212027072906, + "epoch": 0.7622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14617592910948715, + "kl": 0.05190131440758705, + "learning_rate": 9.303166090900081e-07, + "loss": 0.0004, + "num_tokens": 19579535.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9100128412246704, + "sampling/importance_sampling_ratio/mean": 1.000030279159546, + "sampling/importance_sampling_ratio/min": 0.250815212726593, + "sampling/sampling_logp_difference/max": 1.3830387592315674, + "sampling/sampling_logp_difference/mean": 0.01475166529417038, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.21046610176563263, + "epoch": 0.7634803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2400533275142525, + "kl": 0.0655660331249237, + "learning_rate": 9.299534108588217e-07, + "loss": -0.002, + "num_tokens": 19614485.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.9693632125854492, + "sampling/importance_sampling_ratio/mean": 1.0004092454910278, + "sampling/importance_sampling_ratio/min": 0.5376322865486145, + "sampling/sampling_logp_difference/max": 0.6777102947235107, + "sampling/sampling_logp_difference/mean": 0.013047540560364723, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 321.328125, + "completions/mean_terminated_length": 321.328125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.15701323747634888, + "epoch": 0.7647058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043994570241456525, + "kl": 0.03948718309402466, + "learning_rate": 9.295893398494497e-07, + "loss": 0.0004, + "num_tokens": 19653674.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996031522750854, + "sampling/importance_sampling_ratio/min": 0.4808013141155243, + "sampling/sampling_logp_difference/max": 2.3391895294189453, + "sampling/sampling_logp_difference/mean": 0.010676901787519455, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 216.390625, + "completions/mean_terminated_length": 216.390625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.20857155323028564, + "epoch": 0.7659313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10756971013522826, + "kl": 0.09376202523708344, + "learning_rate": 9.29224396800933e-07, + "loss": 0.0009, + "num_tokens": 19683651.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6223664283752441, + "sampling/importance_sampling_ratio/mean": 0.9997443556785583, + "sampling/importance_sampling_ratio/min": 0.5096457600593567, + "sampling/sampling_logp_difference/max": 0.674039363861084, + "sampling/sampling_logp_difference/mean": 0.01420876570045948, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 229.78125, + "completions/mean_terminated_length": 229.78125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.22043286263942719, + "epoch": 0.7671568627450981, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3501048673191143, + "kl": 0.07209709286689758, + "learning_rate": 9.288585824540832e-07, + "loss": 0.0099, + "num_tokens": 19717621.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.9848257303237915, + "sampling/importance_sampling_ratio/mean": 0.9999575018882751, + "sampling/importance_sampling_ratio/min": 0.481048047542572, + "sampling/sampling_logp_difference/max": 0.731788158416748, + "sampling/sampling_logp_difference/mean": 0.014640593901276588, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 151.734375, + "completions/mean_terminated_length": 151.734375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.1915517896413803, + "epoch": 0.7683823529411765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08727131737687056, + "kl": 0.08875685185194016, + "learning_rate": 9.284918975514797e-07, + "loss": 0.0009, + "num_tokens": 19741828.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.804423213005066, + "sampling/importance_sampling_ratio/mean": 0.9999748468399048, + "sampling/importance_sampling_ratio/min": 0.495414674282074, + "sampling/sampling_logp_difference/max": 0.7023601531982422, + "sampling/sampling_logp_difference/mean": 0.013834046199917793, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 294.578125, + "completions/mean_terminated_length": 294.578125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.22867414355278015, + "epoch": 0.7696078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9382855513200445, + "kl": 0.07310361415147781, + "learning_rate": 9.281243428374701e-07, + "loss": 0.0048, + "num_tokens": 19774457.0, + "reward": -0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003331899642944, + "sampling/importance_sampling_ratio/min": 0.1942329853773117, + "sampling/sampling_logp_difference/max": 1.6386969089508057, + "sampling/sampling_logp_difference/mean": 0.014818103052675724, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 219.34375, + "completions/mean_terminated_length": 219.34375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.23702575266361237, + "epoch": 0.7708333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.38462137376169, + "kl": 0.06711365282535553, + "learning_rate": 9.277559190581669e-07, + "loss": -0.0636, + "num_tokens": 19812911.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6645729541778564, + "sampling/importance_sampling_ratio/mean": 1.0004074573516846, + "sampling/importance_sampling_ratio/min": 0.3960508406162262, + "sampling/sampling_logp_difference/max": 0.9262127876281738, + "sampling/sampling_logp_difference/mean": 0.014895858243107796, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 164.15625, + "completions/mean_terminated_length": 164.15625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2659897804260254, + "epoch": 0.7720588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7038503647603291, + "kl": 0.09886568784713745, + "learning_rate": 9.273866269614473e-07, + "loss": -0.0624, + "num_tokens": 19838233.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.785392165184021, + "sampling/importance_sampling_ratio/mean": 1.0003658533096313, + "sampling/importance_sampling_ratio/min": 0.5384262204170227, + "sampling/sampling_logp_difference/max": 0.6191048622131348, + "sampling/sampling_logp_difference/mean": 0.015854064375162125, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1654.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 255.984375, + "completions/mean_terminated_length": 255.984375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.2972226142883301, + "epoch": 0.7732843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5270971962584234, + "kl": 0.060013528913259506, + "learning_rate": 9.270164672969507e-07, + "loss": -0.007, + "num_tokens": 19867592.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5072145462036133, + "sampling/importance_sampling_ratio/mean": 0.9994542598724365, + "sampling/importance_sampling_ratio/min": 0.3650263249874115, + "sampling/sampling_logp_difference/max": 1.0077857971191406, + "sampling/sampling_logp_difference/mean": 0.016513332724571228, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 244.828125, + "completions/mean_terminated_length": 244.828125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.2695668339729309, + "epoch": 0.7745098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3303767200565322, + "kl": 0.07084056735038757, + "learning_rate": 9.266454408160777e-07, + "loss": -0.0347, + "num_tokens": 19899885.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007511377334595, + "sampling/importance_sampling_ratio/min": 0.4982454776763916, + "sampling/sampling_logp_difference/max": 0.8159983158111572, + "sampling/sampling_logp_difference/mean": 0.016307897865772247, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 155.046875, + "completions/mean_terminated_length": 155.046875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.17744068801403046, + "epoch": 0.7757352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10603676583593676, + "kl": 0.07468618452548981, + "learning_rate": 9.262735482719887e-07, + "loss": 0.0007, + "num_tokens": 19923392.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002634525299072, + "sampling/importance_sampling_ratio/min": 0.5914404988288879, + "sampling/sampling_logp_difference/max": 0.8146648406982422, + "sampling/sampling_logp_difference/mean": 0.014950464479625225, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 207.546875, + "completions/mean_terminated_length": 207.546875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2222384810447693, + "epoch": 0.7769607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3618649667774445, + "kl": 0.07804533839225769, + "learning_rate": 9.259007904196021e-07, + "loss": -0.0396, + "num_tokens": 19955219.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5744249820709229, + "sampling/importance_sampling_ratio/mean": 0.999670147895813, + "sampling/importance_sampling_ratio/min": 0.459637314081192, + "sampling/sampling_logp_difference/max": 0.7773175239562988, + "sampling/sampling_logp_difference/mean": 0.014833863824605942, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 279.078125, + "completions/mean_terminated_length": 279.078125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.28159821033477783, + "epoch": 0.7781862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4507699159557401, + "kl": 0.07019156217575073, + "learning_rate": 9.255271680155923e-07, + "loss": -0.0141, + "num_tokens": 19992616.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001819133758545, + "sampling/importance_sampling_ratio/min": 0.33303511142730713, + "sampling/sampling_logp_difference/max": 1.0995073318481445, + "sampling/sampling_logp_difference/mean": 0.015980804339051247, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 210.671875, + "completions/mean_terminated_length": 210.671875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.22452111542224884, + "epoch": 0.7794117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3651495022210556, + "kl": 0.050095200538635254, + "learning_rate": 9.251526818183896e-07, + "loss": 0.0018, + "num_tokens": 20027315.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6550902128219604, + "sampling/importance_sampling_ratio/mean": 0.9999434947967529, + "sampling/importance_sampling_ratio/min": 0.43477028608322144, + "sampling/sampling_logp_difference/max": 0.832937479019165, + "sampling/sampling_logp_difference/mean": 0.015550890006124973, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 190.734375, + "completions/mean_terminated_length": 190.734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.1741478443145752, + "epoch": 0.7806372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4642627408447395, + "kl": 0.05440554767847061, + "learning_rate": 9.247773325881769e-07, + "loss": -0.0098, + "num_tokens": 20055058.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7198848724365234, + "sampling/importance_sampling_ratio/mean": 0.9996518492698669, + "sampling/importance_sampling_ratio/min": 0.5685391426086426, + "sampling/sampling_logp_difference/max": 0.5646851062774658, + "sampling/sampling_logp_difference/mean": 0.012164910323917866, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 194.984375, + "completions/mean_terminated_length": 194.984375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.22004134953022003, + "epoch": 0.7818627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.320737729904589, + "kl": 0.07392781972885132, + "learning_rate": 9.244011210868895e-07, + "loss": -0.0062, + "num_tokens": 20085537.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.8553956747055054, + "sampling/importance_sampling_ratio/mean": 0.9997206926345825, + "sampling/importance_sampling_ratio/min": 0.42811888456344604, + "sampling/sampling_logp_difference/max": 0.8483543395996094, + "sampling/sampling_logp_difference/mean": 0.015586144290864468, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 196.03125, + "completions/mean_terminated_length": 196.03125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.19160839915275574, + "epoch": 0.7830882352941176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052688931066442, + "kl": 0.0540931262075901, + "learning_rate": 9.240240480782129e-07, + "loss": 0.0005, + "num_tokens": 20114563.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5912364721298218, + "sampling/importance_sampling_ratio/mean": 0.9991684556007385, + "sampling/importance_sampling_ratio/min": 0.42439547181129456, + "sampling/sampling_logp_difference/max": 0.8570895195007324, + "sampling/sampling_logp_difference/mean": 0.014237429946660995, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 239.296875, + "completions/mean_terminated_length": 239.296875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.27263566851615906, + "epoch": 0.7843137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5731791942277236, + "kl": 0.06309030950069427, + "learning_rate": 9.236461143275815e-07, + "loss": -0.0386, + "num_tokens": 20149846.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.57704496383667, + "sampling/importance_sampling_ratio/mean": 1.0001354217529297, + "sampling/importance_sampling_ratio/min": 0.5680131316184998, + "sampling/sampling_logp_difference/max": 0.5656107664108276, + "sampling/sampling_logp_difference/mean": 0.015975212678313255, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 209.796875, + "completions/mean_terminated_length": 209.796875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.25127124786376953, + "epoch": 0.7855392156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8915945066494564, + "kl": 0.11664341390132904, + "learning_rate": 9.232673206021767e-07, + "loss": -0.0287, + "num_tokens": 20177657.0, + "reward": 0.125, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.8150728940963745, + "sampling/importance_sampling_ratio/mean": 1.000038743019104, + "sampling/importance_sampling_ratio/min": 0.5329506993293762, + "sampling/sampling_logp_difference/max": 0.629326343536377, + "sampling/sampling_logp_difference/mean": 0.01448842603713274, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 191.046875, + "completions/mean_terminated_length": 191.046875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.20816347002983093, + "epoch": 0.7867647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3465381563685925, + "kl": 0.08171716332435608, + "learning_rate": 9.228876676709259e-07, + "loss": -0.0336, + "num_tokens": 20205516.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6132919788360596, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.5158289670944214, + "sampling/sampling_logp_difference/max": 0.6619800329208374, + "sampling/sampling_logp_difference/mean": 0.012775387614965439, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 199.453125, + "completions/mean_terminated_length": 199.453125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2456672191619873, + "epoch": 0.7879901960784313, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.053343337626516, + "kl": 0.07074607908725739, + "learning_rate": 9.225071563045006e-07, + "loss": -0.027, + "num_tokens": 20233449.0, + "reward": 0.5, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006208419799805, + "sampling/importance_sampling_ratio/min": 0.4944218397140503, + "sampling/sampling_logp_difference/max": 0.9277139902114868, + "sampling/sampling_logp_difference/mean": 0.014937615022063255, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 172.171875, + "completions/mean_terminated_length": 172.171875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.22118590772151947, + "epoch": 0.7892156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0263598295111773, + "kl": 0.07796844840049744, + "learning_rate": 9.221257872753144e-07, + "loss": 0.007, + "num_tokens": 20261348.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992664456367493, + "sampling/importance_sampling_ratio/min": 0.3534892499446869, + "sampling/sampling_logp_difference/max": 1.0399022102355957, + "sampling/sampling_logp_difference/mean": 0.016085071489214897, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 152.078125, + "completions/mean_terminated_length": 152.078125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.27297282218933105, + "epoch": 0.7904411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.331542394983071, + "kl": 0.1148001030087471, + "learning_rate": 9.217435613575226e-07, + "loss": 0.0129, + "num_tokens": 20286313.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8570010662078857, + "sampling/importance_sampling_ratio/mean": 1.000152587890625, + "sampling/importance_sampling_ratio/min": 0.6056237816810608, + "sampling/sampling_logp_difference/max": 0.6189628839492798, + "sampling/sampling_logp_difference/mean": 0.01716921478509903, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 162.703125, + "completions/mean_terminated_length": 162.703125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.19279450178146362, + "epoch": 0.7916666666666666, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.211441792495886, + "kl": 0.07502549886703491, + "learning_rate": 9.213604793270196e-07, + "loss": -0.008, + "num_tokens": 20311382.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006800889968872, + "sampling/importance_sampling_ratio/min": 0.2587150037288666, + "sampling/sampling_logp_difference/max": 1.3520281314849854, + "sampling/sampling_logp_difference/mean": 0.014370124787092209, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 263.8125, + "completions/mean_terminated_length": 263.8125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.29367929697036743, + "epoch": 0.7928921568627451, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6522798136680616, + "kl": 0.0979141965508461, + "learning_rate": 9.209765419614373e-07, + "loss": 0.0023, + "num_tokens": 20343178.0, + "reward": 0.0625, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5994728803634644, + "sampling/importance_sampling_ratio/mean": 1.0001908540725708, + "sampling/importance_sampling_ratio/min": 0.296130895614624, + "sampling/sampling_logp_difference/max": 1.2169537544250488, + "sampling/sampling_logp_difference/mean": 0.01599789410829544, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 250.40625, + "completions/mean_terminated_length": 250.40625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2368079423904419, + "epoch": 0.7941176470588235, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7249496990139329, + "kl": 0.05633680149912834, + "learning_rate": 9.205917500401447e-07, + "loss": -0.0046, + "num_tokens": 20377812.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7612030506134033, + "sampling/importance_sampling_ratio/mean": 1.0002657175064087, + "sampling/importance_sampling_ratio/min": 0.00012346301809884608, + "sampling/sampling_logp_difference/max": 8.999568939208984, + "sampling/sampling_logp_difference/mean": 0.014381260611116886, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 189.859375, + "completions/mean_terminated_length": 189.859375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.27118250727653503, + "epoch": 0.7953431372549019, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.91543302081558, + "kl": 0.0993255227804184, + "learning_rate": 9.202061043442447e-07, + "loss": -0.0498, + "num_tokens": 20405339.0, + "reward": 0.25, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003424882888794, + "sampling/importance_sampling_ratio/min": 0.3208965063095093, + "sampling/sampling_logp_difference/max": 1.1366366147994995, + "sampling/sampling_logp_difference/mean": 0.015637464821338654, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 218.90625, + "completions/mean_terminated_length": 218.90625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.31074705719947815, + "epoch": 0.7965686274509803, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.765669451975907, + "kl": 0.06831445544958115, + "learning_rate": 9.198196056565738e-07, + "loss": -0.0094, + "num_tokens": 20438261.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000112533569336, + "sampling/importance_sampling_ratio/min": 0.37548714876174927, + "sampling/sampling_logp_difference/max": 0.9795310497283936, + "sampling/sampling_logp_difference/mean": 0.017085541039705276, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 279.390625, + "completions/mean_terminated_length": 279.390625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.29999005794525146, + "epoch": 0.7977941176470589, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1497807854724997, + "kl": 0.06688694655895233, + "learning_rate": 9.194322547616997e-07, + "loss": -0.0281, + "num_tokens": 20474110.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996974468231201, + "sampling/importance_sampling_ratio/min": 0.42440560460090637, + "sampling/sampling_logp_difference/max": 0.8570656776428223, + "sampling/sampling_logp_difference/mean": 0.015580703504383564, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 315.265625, + "completions/mean_terminated_length": 315.265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2834051251411438, + "epoch": 0.7990196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.226907817093407, + "kl": 0.06724686920642853, + "learning_rate": 9.190440524459202e-07, + "loss": 0.0032, + "num_tokens": 20515055.0, + "reward": 0.09375, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.775849461555481, + "sampling/importance_sampling_ratio/mean": 1.0002250671386719, + "sampling/importance_sampling_ratio/min": 0.3902806341648102, + "sampling/sampling_logp_difference/max": 0.9408892393112183, + "sampling/sampling_logp_difference/mean": 0.015994053333997726, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 294.40625, + "completions/mean_terminated_length": 294.40625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.21584855020046234, + "epoch": 0.8002450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0190111017409385, + "kl": 0.07001065462827682, + "learning_rate": 9.186549994972616e-07, + "loss": 0.0081, + "num_tokens": 20553129.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996395707130432, + "sampling/importance_sampling_ratio/min": 0.4057103395462036, + "sampling/sampling_logp_difference/max": 0.9528882503509521, + "sampling/sampling_logp_difference/mean": 0.012972021475434303, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 294.984375, + "completions/mean_terminated_length": 294.984375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3439209461212158, + "epoch": 0.8014705882352942, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6817432592765151, + "kl": 0.10327893495559692, + "learning_rate": 9.182650967054766e-07, + "loss": -0.0347, + "num_tokens": 20591640.0, + "reward": 0.78125, + "reward_std": 0.519389271736145, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000304937362671, + "sampling/importance_sampling_ratio/min": 0.051093198359012604, + "sampling/sampling_logp_difference/max": 2.9741039276123047, + "sampling/sampling_logp_difference/mean": 0.016730263829231262, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 252.15625, + "completions/mean_terminated_length": 252.15625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.3634037971496582, + "epoch": 0.8026960784313726, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.508760975787, + "kl": 0.09479008615016937, + "learning_rate": 9.178743448620431e-07, + "loss": 0.0139, + "num_tokens": 20625266.0, + "reward": 0.03125, + "reward_std": 0.7667282819747925, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004658699035645, + "sampling/importance_sampling_ratio/min": 0.5634726285934448, + "sampling/sampling_logp_difference/max": 0.9256799221038818, + "sampling/sampling_logp_difference/mean": 0.01804208755493164, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 309.25, + "completions/mean_terminated_length": 309.25, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2615200877189636, + "epoch": 0.803921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.267371213485665, + "kl": 0.09800644218921661, + "learning_rate": 9.174827447601627e-07, + "loss": 0.0391, + "num_tokens": 20661058.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6629855632781982, + "sampling/importance_sampling_ratio/mean": 0.9998881816864014, + "sampling/importance_sampling_ratio/min": 0.4334590435028076, + "sampling/sampling_logp_difference/max": 0.8359580039978027, + "sampling/sampling_logp_difference/mean": 0.011825084686279297, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1392.0, + "completions/max_terminated_length": 1392.0, + "completions/mean_length": 311.828125, + "completions/mean_terminated_length": 311.828125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.2893800139427185, + "epoch": 0.8051470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9510614377495171, + "kl": 0.09265218675136566, + "learning_rate": 9.170902971947588e-07, + "loss": -0.006, + "num_tokens": 20698935.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.7520533800125122, + "sampling/importance_sampling_ratio/mean": 0.9994891881942749, + "sampling/importance_sampling_ratio/min": 0.26814839243888855, + "sampling/sampling_logp_difference/max": 1.3162147998809814, + "sampling/sampling_logp_difference/mean": 0.015570033341646194, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 195.484375, + "completions/mean_terminated_length": 195.484375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.21507211029529572, + "epoch": 0.8063725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0788738109658864, + "kl": 0.10468155145645142, + "learning_rate": 9.166970029624749e-07, + "loss": 0.001, + "num_tokens": 20725398.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.581348180770874, + "sampling/importance_sampling_ratio/mean": 0.9999153017997742, + "sampling/importance_sampling_ratio/min": 0.6252502799034119, + "sampling/sampling_logp_difference/max": 0.4696033000946045, + "sampling/sampling_logp_difference/mean": 0.013945435173809528, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 223.9375, + "completions/mean_terminated_length": 223.9375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.22752121090888977, + "epoch": 0.8075980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06139111869251549, + "kl": 0.10037748515605927, + "learning_rate": 9.163028628616738e-07, + "loss": 0.001, + "num_tokens": 20757906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8718841075897217, + "sampling/importance_sampling_ratio/mean": 1.0001585483551025, + "sampling/importance_sampling_ratio/min": 0.5158049464225769, + "sampling/sampling_logp_difference/max": 0.6620266437530518, + "sampling/sampling_logp_difference/mean": 0.013948909007012844, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 296.078125, + "completions/mean_terminated_length": 296.078125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3812859058380127, + "epoch": 0.8088235294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4307012265727914, + "kl": 0.11581394076347351, + "learning_rate": 9.159078776924345e-07, + "loss": -0.0174, + "num_tokens": 20794663.0, + "reward": 0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6047122478485107, + "sampling/importance_sampling_ratio/mean": 1.0004184246063232, + "sampling/importance_sampling_ratio/min": 0.6100687980651855, + "sampling/sampling_logp_difference/max": 0.4941835403442383, + "sampling/sampling_logp_difference/mean": 0.018269555643200874, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 301.53125, + "completions/mean_terminated_length": 301.53125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.2671017646789551, + "epoch": 0.8100490196078431, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6516933420548836, + "kl": 0.10355545580387115, + "learning_rate": 9.155120482565519e-07, + "loss": 0.0121, + "num_tokens": 20831353.0, + "reward": 0.0625, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006881952285767, + "sampling/importance_sampling_ratio/min": 0.4191407561302185, + "sampling/sampling_logp_difference/max": 0.869548499584198, + "sampling/sampling_logp_difference/mean": 0.014300312846899033, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 230.671875, + "completions/mean_terminated_length": 230.671875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.23248368501663208, + "epoch": 0.8112745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1018616070568348, + "kl": 0.10352177917957306, + "learning_rate": 9.15115375357535e-07, + "loss": -0.0034, + "num_tokens": 20860932.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5986348390579224, + "sampling/importance_sampling_ratio/mean": 1.000486969947815, + "sampling/importance_sampling_ratio/min": 0.6132914423942566, + "sampling/sampling_logp_difference/max": 0.48891496658325195, + "sampling/sampling_logp_difference/mean": 0.013316703960299492, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 248.53125, + "completions/mean_terminated_length": 248.53125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2861756682395935, + "epoch": 0.8125, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9778410901092632, + "kl": 0.1253623366355896, + "learning_rate": 9.147178598006044e-07, + "loss": 0.0195, + "num_tokens": 20892838.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.7730779647827148, + "sampling/importance_sampling_ratio/mean": 1.0011225938796997, + "sampling/importance_sampling_ratio/min": 0.5483730435371399, + "sampling/sampling_logp_difference/max": 0.6007994413375854, + "sampling/sampling_logp_difference/mean": 0.01466517522931099, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 221.359375, + "completions/mean_terminated_length": 221.359375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3348690867424011, + "epoch": 0.8137254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06287644422205922, + "kl": 0.10089477151632309, + "learning_rate": 9.143195023926917e-07, + "loss": 0.0011, + "num_tokens": 20921373.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9979796409606934, + "sampling/importance_sampling_ratio/mean": 1.0010348558425903, + "sampling/importance_sampling_ratio/min": 0.44230785965919495, + "sampling/sampling_logp_difference/max": 0.8157491683959961, + "sampling/sampling_logp_difference/mean": 0.01741403341293335, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 367.046875, + "completions/mean_terminated_length": 367.046875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.3210401237010956, + "epoch": 0.8149509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8162083856121677, + "kl": 0.05032963678240776, + "learning_rate": 9.139203039424368e-07, + "loss": 0.0097, + "num_tokens": 20962144.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954104423523, + "sampling/importance_sampling_ratio/min": 0.43094602227211, + "sampling/sampling_logp_difference/max": 0.923588752746582, + "sampling/sampling_logp_difference/mean": 0.014368601143360138, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 196.375, + "completions/mean_terminated_length": 196.375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.33943772315979004, + "epoch": 0.8161764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.027183857117543, + "kl": 0.1429498791694641, + "learning_rate": 9.135202652601876e-07, + "loss": 0.0098, + "num_tokens": 20990488.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6068168878555298, + "sampling/importance_sampling_ratio/mean": 0.9997010827064514, + "sampling/importance_sampling_ratio/min": 0.4161491394042969, + "sampling/sampling_logp_difference/max": 0.8767116069793701, + "sampling/sampling_logp_difference/mean": 0.01838063821196556, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 325.765625, + "completions/mean_terminated_length": 325.765625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.3099474310874939, + "epoch": 0.8174019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8867191688162285, + "kl": 0.07996654510498047, + "learning_rate": 9.131193871579974e-07, + "loss": -0.0222, + "num_tokens": 21039177.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001814365386963, + "sampling/importance_sampling_ratio/min": 0.5488417148590088, + "sampling/sampling_logp_difference/max": 0.7263705730438232, + "sampling/sampling_logp_difference/mean": 0.015426268801093102, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 267.765625, + "completions/mean_terminated_length": 267.765625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.32460522651672363, + "epoch": 0.8186274509803921, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9396556694981224, + "kl": 0.13725237548351288, + "learning_rate": 9.127176704496231e-07, + "loss": -0.0005, + "num_tokens": 21077546.0, + "reward": 0.78125, + "reward_std": 0.519389271736145, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6607247591018677, + "sampling/importance_sampling_ratio/mean": 0.9998514652252197, + "sampling/importance_sampling_ratio/min": 0.30301758646965027, + "sampling/sampling_logp_difference/max": 1.1939644813537598, + "sampling/sampling_logp_difference/mean": 0.018017791211605072, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 283.59375, + "completions/mean_terminated_length": 283.59375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3330175280570984, + "epoch": 0.8198529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2150621305012839, + "kl": 0.1321268230676651, + "learning_rate": 9.123151159505241e-07, + "loss": 0.0117, + "num_tokens": 21109408.0, + "reward": -0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": -0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8197921514511108, + "sampling/importance_sampling_ratio/mean": 0.9998218417167664, + "sampling/importance_sampling_ratio/min": 0.5043226480484009, + "sampling/sampling_logp_difference/max": 0.6845390796661377, + "sampling/sampling_logp_difference/mean": 0.015419330447912216, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 247.515625, + "completions/mean_terminated_length": 247.515625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.31832343339920044, + "epoch": 0.821078431372549, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7962435540211532, + "kl": 0.11252126097679138, + "learning_rate": 9.119117244778607e-07, + "loss": -0.0068, + "num_tokens": 21145073.0, + "reward": 0.78125, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6199977397918701, + "sampling/importance_sampling_ratio/mean": 1.0006837844848633, + "sampling/importance_sampling_ratio/min": 0.5143709182739258, + "sampling/sampling_logp_difference/max": 0.6648106575012207, + "sampling/sampling_logp_difference/mean": 0.017651362344622612, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 191.21875, + "completions/mean_terminated_length": 191.21875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.29982662200927734, + "epoch": 0.8223039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5511913501152876, + "kl": 0.100832000374794, + "learning_rate": 9.115074968504921e-07, + "loss": 0.0226, + "num_tokens": 21178943.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996993541717529, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 1.125089168548584, + "sampling/sampling_logp_difference/mean": 0.016022585332393646, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 252.46875, + "completions/mean_terminated_length": 252.46875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.2658326029777527, + "epoch": 0.8235294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8602775957763744, + "kl": 0.08079873025417328, + "learning_rate": 9.111024338889746e-07, + "loss": -0.0058, + "num_tokens": 21209245.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6587384939193726, + "sampling/importance_sampling_ratio/mean": 0.9992792010307312, + "sampling/importance_sampling_ratio/min": 0.47799503803253174, + "sampling/sampling_logp_difference/max": 0.7381548881530762, + "sampling/sampling_logp_difference/mean": 0.014690998010337353, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 235.390625, + "completions/mean_terminated_length": 235.390625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.24077001214027405, + "epoch": 0.8247549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07431353902079195, + "kl": 0.07888946682214737, + "learning_rate": 9.106965364155605e-07, + "loss": 0.0007, + "num_tokens": 21243622.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5283786058425903, + "sampling/importance_sampling_ratio/mean": 0.9998764991760254, + "sampling/importance_sampling_ratio/min": 0.5260623097419739, + "sampling/sampling_logp_difference/max": 0.6423356533050537, + "sampling/sampling_logp_difference/mean": 0.014851542189717293, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1450.0, + "completions/max_terminated_length": 1450.0, + "completions/mean_length": 247.578125, + "completions/mean_terminated_length": 247.578125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.25872746109962463, + "epoch": 0.8259803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8442583116902658, + "kl": 0.1108008325099945, + "learning_rate": 9.102898052541957e-07, + "loss": -0.0119, + "num_tokens": 21280155.0, + "reward": -0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6926437616348267, + "sampling/importance_sampling_ratio/mean": 0.9992407560348511, + "sampling/importance_sampling_ratio/min": 0.5154118537902832, + "sampling/sampling_logp_difference/max": 0.662788987159729, + "sampling/sampling_logp_difference/mean": 0.014943244867026806, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 226.5, + "completions/mean_terminated_length": 226.5, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.21067966520786285, + "epoch": 0.8272058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03647778601333582, + "kl": 0.07727044075727463, + "learning_rate": 9.09882241230519e-07, + "loss": 0.0006, + "num_tokens": 21310011.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4726886749267578, + "sampling/importance_sampling_ratio/mean": 0.9998340606689453, + "sampling/importance_sampling_ratio/min": 0.6132952570915222, + "sampling/sampling_logp_difference/max": 0.4889087677001953, + "sampling/sampling_logp_difference/mean": 0.012996380217373371, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 267.984375, + "completions/mean_terminated_length": 267.984375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2664591073989868, + "epoch": 0.8284313725490197, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2194415878196174, + "kl": 0.09758429229259491, + "learning_rate": 9.094738451718593e-07, + "loss": -0.026, + "num_tokens": 21343226.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5972039699554443, + "sampling/importance_sampling_ratio/mean": 1.000079870223999, + "sampling/importance_sampling_ratio/min": 0.6164050102233887, + "sampling/sampling_logp_difference/max": 0.4838510751724243, + "sampling/sampling_logp_difference/mean": 0.014850573614239693, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 244.796875, + "completions/mean_terminated_length": 244.796875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.25565797090530396, + "epoch": 0.8296568627450981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0563848071739717, + "kl": 0.07639829814434052, + "learning_rate": 9.09064617907235e-07, + "loss": 0.0007, + "num_tokens": 21373021.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8061039447784424, + "sampling/importance_sampling_ratio/mean": 1.0006325244903564, + "sampling/importance_sampling_ratio/min": 0.4712215065956116, + "sampling/sampling_logp_difference/max": 0.7524270415306091, + "sampling/sampling_logp_difference/mean": 0.013312511146068573, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 212.203125, + "completions/mean_terminated_length": 212.203125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.26213979721069336, + "epoch": 0.8308823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4030879522329296, + "kl": 0.1045461893081665, + "learning_rate": 9.086545602673513e-07, + "loss": -0.0309, + "num_tokens": 21401386.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6100748777389526, + "sampling/importance_sampling_ratio/mean": 1.0002021789550781, + "sampling/importance_sampling_ratio/min": 0.36000654101371765, + "sampling/sampling_logp_difference/max": 1.0216331481933594, + "sampling/sampling_logp_difference/mean": 0.015743952244520187, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 287.1875, + "completions/mean_terminated_length": 287.1875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.31720712780952454, + "epoch": 0.8321078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2377377716101043, + "kl": 0.09231787919998169, + "learning_rate": 9.082436730845993e-07, + "loss": 0.0323, + "num_tokens": 21437542.0, + "reward": 0.34375, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6108328104019165, + "sampling/importance_sampling_ratio/mean": 1.0001319646835327, + "sampling/importance_sampling_ratio/min": 0.4297125041484833, + "sampling/sampling_logp_difference/max": 0.8446389436721802, + "sampling/sampling_logp_difference/mean": 0.014885222539305687, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 271.65625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.33510950207710266, + "epoch": 0.8333333333333334, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5882620054880845, + "kl": 0.09794466197490692, + "learning_rate": 9.07831957193054e-07, + "loss": 0.0252, + "num_tokens": 21476640.0, + "reward": -0.3125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.842240333557129, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.4911081790924072, + "sampling/sampling_logp_difference/max": 0.7110908031463623, + "sampling/sampling_logp_difference/mean": 0.015907227993011475, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 191.578125, + "completions/mean_terminated_length": 191.578125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3127414882183075, + "epoch": 0.8345588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059262354954274656, + "kl": 0.11319208145141602, + "learning_rate": 9.074194134284725e-07, + "loss": 0.0011, + "num_tokens": 21507397.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000261068344116, + "sampling/importance_sampling_ratio/min": 0.19975581765174866, + "sampling/sampling_logp_difference/max": 1.6106595993041992, + "sampling/sampling_logp_difference/mean": 0.0173744335770607, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 248.9375, + "completions/mean_terminated_length": 248.9375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3500162661075592, + "epoch": 0.8357843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.653802968950841, + "kl": 0.10595337301492691, + "learning_rate": 9.070060426282924e-07, + "loss": -0.0018, + "num_tokens": 21542417.0, + "reward": 0.03125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.734398365020752, + "sampling/importance_sampling_ratio/mean": 1.0001394748687744, + "sampling/importance_sampling_ratio/min": 0.0026407502591609955, + "sampling/sampling_logp_difference/max": 5.936692237854004, + "sampling/sampling_logp_difference/mean": 0.017924603074789047, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 288.09375, + "completions/mean_terminated_length": 288.09375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3846856355667114, + "epoch": 0.8370098039215687, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4509856277488788, + "kl": 0.07933727651834488, + "learning_rate": 9.065918456316303e-07, + "loss": -0.0385, + "num_tokens": 21575911.0, + "reward": 0.0, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9828444719314575, + "sampling/importance_sampling_ratio/mean": 0.9999169707298279, + "sampling/importance_sampling_ratio/min": 0.5102310180664062, + "sampling/sampling_logp_difference/max": 0.6845324039459229, + "sampling/sampling_logp_difference/mean": 0.01683010533452034, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 321.171875, + "completions/mean_terminated_length": 321.171875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.35549187660217285, + "epoch": 0.8382352941176471, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.105030079988878, + "kl": 0.09580346941947937, + "learning_rate": 9.061768232792802e-07, + "loss": 0.0505, + "num_tokens": 21621730.0, + "reward": 0.28125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997661113739014, + "sampling/importance_sampling_ratio/min": 0.620428204536438, + "sampling/sampling_logp_difference/max": 0.736009955406189, + "sampling/sampling_logp_difference/mean": 0.01514836959540844, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 257.46875, + "completions/mean_terminated_length": 257.46875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.34016239643096924, + "epoch": 0.8394607843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2922578121973038, + "kl": 0.11721494048833847, + "learning_rate": 9.057609764137109e-07, + "loss": 0.0309, + "num_tokens": 21658128.0, + "reward": 0.21875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.7992300987243652, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.36027073860168457, + "sampling/sampling_logp_difference/max": 1.0208995342254639, + "sampling/sampling_logp_difference/mean": 0.016892150044441223, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 299.421875, + "completions/mean_terminated_length": 299.421875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.37253424525260925, + "epoch": 0.8406862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.221488784134185, + "kl": 0.11370334774255753, + "learning_rate": 9.053443058790651e-07, + "loss": 0.0238, + "num_tokens": 21696587.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004000663757324, + "sampling/importance_sampling_ratio/min": 0.5096390247344971, + "sampling/sampling_logp_difference/max": 0.776209831237793, + "sampling/sampling_logp_difference/mean": 0.016573626548051834, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 248.8125, + "completions/mean_terminated_length": 248.8125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.3833868205547333, + "epoch": 0.8419117647058824, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2787064048635344, + "kl": 0.13017961382865906, + "learning_rate": 9.049268125211575e-07, + "loss": -0.0214, + "num_tokens": 21729791.0, + "reward": 0.15625, + "reward_std": 0.5281128883361816, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.617061972618103, + "sampling/importance_sampling_ratio/mean": 0.9997603297233582, + "sampling/importance_sampling_ratio/min": 0.5065748691558838, + "sampling/sampling_logp_difference/max": 0.6800830364227295, + "sampling/sampling_logp_difference/mean": 0.01717999391257763, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 179.84375, + "completions/mean_terminated_length": 179.84375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.20183613896369934, + "epoch": 0.8431372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0681915059421622, + "kl": 0.09983283281326294, + "learning_rate": 9.045084971874737e-07, + "loss": 0.0009, + "num_tokens": 21758629.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 0.5593070387840271, + "sampling/sampling_logp_difference/max": 0.7898805141448975, + "sampling/sampling_logp_difference/mean": 0.013118119910359383, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 255.46875, + "completions/mean_terminated_length": 255.46875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.267900288105011, + "epoch": 0.8443627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8681915213752832, + "kl": 0.09064662456512451, + "learning_rate": 9.040893607271668e-07, + "loss": 0.0087, + "num_tokens": 21799875.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8082319498062134, + "sampling/importance_sampling_ratio/mean": 1.000293254852295, + "sampling/importance_sampling_ratio/min": 0.035122405737638474, + "sampling/sampling_logp_difference/max": 3.3489160537719727, + "sampling/sampling_logp_difference/mean": 0.014291105791926384, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 194.34375, + "completions/mean_terminated_length": 194.34375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.32394784688949585, + "epoch": 0.8455882352941176, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4189443295984745, + "kl": 0.11329706013202667, + "learning_rate": 9.036694039910576e-07, + "loss": 0.0161, + "num_tokens": 21828297.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.7528784275054932, + "sampling/importance_sampling_ratio/mean": 0.9998365640640259, + "sampling/importance_sampling_ratio/min": 0.44248515367507935, + "sampling/sampling_logp_difference/max": 0.8153483867645264, + "sampling/sampling_logp_difference/mean": 0.01646936871111393, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 199.78125, + "completions/mean_terminated_length": 199.78125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3520262539386749, + "epoch": 0.8468137254901961, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1899806532441306, + "kl": 0.1304454207420349, + "learning_rate": 9.032486278316313e-07, + "loss": -0.0161, + "num_tokens": 21858795.0, + "reward": -0.1875, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002292394638062, + "sampling/importance_sampling_ratio/min": 0.4083217978477478, + "sampling/sampling_logp_difference/max": 0.895699679851532, + "sampling/sampling_logp_difference/mean": 0.01799740642309189, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 211.203125, + "completions/mean_terminated_length": 211.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.35917019844055176, + "epoch": 0.8480392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9798873160668752, + "kl": 0.11700897663831711, + "learning_rate": 9.028270331030372e-07, + "loss": -0.0038, + "num_tokens": 21890408.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6099342107772827, + "sampling/importance_sampling_ratio/mean": 1.0000842809677124, + "sampling/importance_sampling_ratio/min": 0.5990613698959351, + "sampling/sampling_logp_difference/max": 0.512391209602356, + "sampling/sampling_logp_difference/mean": 0.01729101501405239, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 163.296875, + "completions/mean_terminated_length": 163.296875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3510112762451172, + "epoch": 0.8492647058823529, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2123932223640246, + "kl": 0.14133670926094055, + "learning_rate": 9.024046206610857e-07, + "loss": -0.0076, + "num_tokens": 21919771.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6247493028640747, + "sampling/importance_sampling_ratio/mean": 1.0004782676696777, + "sampling/importance_sampling_ratio/min": 0.6176419258117676, + "sampling/sampling_logp_difference/max": 0.4853534698486328, + "sampling/sampling_logp_difference/mean": 0.017106808722019196, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 205.328125, + "completions/mean_terminated_length": 205.328125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.29586395621299744, + "epoch": 0.8504901960784313, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1910071691315411, + "kl": 0.11373498290777206, + "learning_rate": 9.019813913632475e-07, + "loss": 0.0064, + "num_tokens": 21950800.0, + "reward": -0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9536798000335693, + "sampling/importance_sampling_ratio/mean": 0.9998559951782227, + "sampling/importance_sampling_ratio/min": 0.42452549934387207, + "sampling/sampling_logp_difference/max": 0.8567832708358765, + "sampling/sampling_logp_difference/mean": 0.01530240848660469, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 139.984375, + "completions/mean_terminated_length": 139.984375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.29637232422828674, + "epoch": 0.8517156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0666575119247821, + "kl": 0.11328499764204025, + "learning_rate": 9.015573460686509e-07, + "loss": 0.0012, + "num_tokens": 21976303.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.649452567100525, + "sampling/importance_sampling_ratio/mean": 1.0007152557373047, + "sampling/importance_sampling_ratio/min": 0.3134042024612427, + "sampling/sampling_logp_difference/max": 1.160261631011963, + "sampling/sampling_logp_difference/mean": 0.018013473600149155, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 155.609375, + "completions/mean_terminated_length": 155.609375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.34009021520614624, + "epoch": 0.8529411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2575099023093053, + "kl": 0.19062024354934692, + "learning_rate": 9.011324856380813e-07, + "loss": 0.0463, + "num_tokens": 22003206.0, + "reward": 0.125, + "reward_std": 0.481805682182312, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001416206359863, + "sampling/importance_sampling_ratio/min": 0.4293578863143921, + "sampling/sampling_logp_difference/max": 0.9967107772827148, + "sampling/sampling_logp_difference/mean": 0.018071670085191727, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 218.46875, + "completions/mean_terminated_length": 218.46875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.41182026267051697, + "epoch": 0.8541666666666666, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5031797223895609, + "kl": 0.13558265566825867, + "learning_rate": 9.007068109339783e-07, + "loss": -0.0399, + "num_tokens": 22036180.0, + "reward": 0.53125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000173807144165, + "sampling/importance_sampling_ratio/min": 0.5055890083312988, + "sampling/sampling_logp_difference/max": 1.032625675201416, + "sampling/sampling_logp_difference/mean": 0.01840224117040634, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 181.015625, + "completions/mean_terminated_length": 181.015625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.28813767433166504, + "epoch": 0.8553921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3615341271921366, + "kl": 0.09089133143424988, + "learning_rate": 9.002803228204348e-07, + "loss": 0.0068, + "num_tokens": 22067205.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821186065674, + "sampling/importance_sampling_ratio/min": 0.4525822401046753, + "sampling/sampling_logp_difference/max": 0.987877607345581, + "sampling/sampling_logp_difference/mean": 0.015335430391132832, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 196.90625, + "completions/mean_terminated_length": 196.90625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3749622106552124, + "epoch": 0.8566176470588235, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1167853118622275, + "kl": 0.14343884587287903, + "learning_rate": 8.998530221631941e-07, + "loss": 0.0398, + "num_tokens": 22099855.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.605394721031189, + "sampling/importance_sampling_ratio/mean": 0.9992402791976929, + "sampling/importance_sampling_ratio/min": 0.40643367171287537, + "sampling/sampling_logp_difference/max": 0.9003345966339111, + "sampling/sampling_logp_difference/mean": 0.017833959311246872, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 204.109375, + "completions/mean_terminated_length": 204.109375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.32660678029060364, + "epoch": 0.8578431372549019, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2200355245520216, + "kl": 0.1060405820608139, + "learning_rate": 8.994249098296502e-07, + "loss": 0.0273, + "num_tokens": 22130614.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5495752096176147, + "sampling/importance_sampling_ratio/mean": 0.9998863935470581, + "sampling/importance_sampling_ratio/min": 0.6147378087043762, + "sampling/sampling_logp_difference/max": 0.4865594506263733, + "sampling/sampling_logp_difference/mean": 0.015310069546103477, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 156.265625, + "completions/mean_terminated_length": 156.265625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3106486201286316, + "epoch": 0.8590686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22984067557623256, + "kl": 0.12091071158647537, + "learning_rate": 8.989959866888437e-07, + "loss": 0.0012, + "num_tokens": 22160279.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6522029638290405, + "sampling/importance_sampling_ratio/mean": 0.9998528361320496, + "sampling/importance_sampling_ratio/min": 0.09124992042779922, + "sampling/sampling_logp_difference/max": 2.394153118133545, + "sampling/sampling_logp_difference/mean": 0.016085583716630936, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 157.46875, + "completions/mean_terminated_length": 157.46875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.24609866738319397, + "epoch": 0.8602941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09035589869489256, + "kl": 0.12495503574609756, + "learning_rate": 8.985662536114612e-07, + "loss": 0.0012, + "num_tokens": 22189877.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9663197994232178, + "sampling/importance_sampling_ratio/mean": 0.9997924566268921, + "sampling/importance_sampling_ratio/min": 0.3909452557563782, + "sampling/sampling_logp_difference/max": 0.9391877055168152, + "sampling/sampling_logp_difference/mean": 0.015291010960936546, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 184.0, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.23346850275993347, + "epoch": 0.8615196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0788715941297072, + "kl": 0.09071210026741028, + "learning_rate": 8.981357114698338e-07, + "loss": 0.0009, + "num_tokens": 22223541.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7657884359359741, + "sampling/importance_sampling_ratio/mean": 1.0000474452972412, + "sampling/importance_sampling_ratio/min": 0.5173292756080627, + "sampling/sampling_logp_difference/max": 0.6590757369995117, + "sampling/sampling_logp_difference/mean": 0.012250609695911407, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 175.265625, + "completions/mean_terminated_length": 175.265625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3375943899154663, + "epoch": 0.8627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.406638942731287, + "kl": 0.1338985413312912, + "learning_rate": 8.977043611379349e-07, + "loss": -0.0306, + "num_tokens": 22251942.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.9877376556396484, + "sampling/importance_sampling_ratio/mean": 0.9998382925987244, + "sampling/importance_sampling_ratio/min": 0.5764951109886169, + "sampling/sampling_logp_difference/max": 0.6869971752166748, + "sampling/sampling_logp_difference/mean": 0.017610445618629456, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 198.65625, + "completions/mean_terminated_length": 198.65625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24917840957641602, + "epoch": 0.8639705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07326889467215009, + "kl": 0.08637434244155884, + "learning_rate": 8.972722034913781e-07, + "loss": 0.0009, + "num_tokens": 22288288.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5786354541778564, + "sampling/importance_sampling_ratio/mean": 1.000070571899414, + "sampling/importance_sampling_ratio/min": 0.5260618925094604, + "sampling/sampling_logp_difference/max": 0.642336368560791, + "sampling/sampling_logp_difference/mean": 0.01618223637342453, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 164.3125, + "completions/mean_terminated_length": 164.3125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3293379843235016, + "epoch": 0.8651960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.923492823775697, + "kl": 0.19896593689918518, + "learning_rate": 8.968392394074163e-07, + "loss": 0.0024, + "num_tokens": 22314868.0, + "reward": -0.0625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995007514953613, + "sampling/importance_sampling_ratio/min": 0.5738527774810791, + "sampling/sampling_logp_difference/max": 0.7254873514175415, + "sampling/sampling_logp_difference/mean": 0.01853271946310997, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 169.65625, + "completions/mean_terminated_length": 169.65625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.29563915729522705, + "epoch": 0.866421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.316755352173363, + "kl": 0.1290314793586731, + "learning_rate": 8.964054697649388e-07, + "loss": 0.038, + "num_tokens": 22344814.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.681364893913269, + "sampling/importance_sampling_ratio/mean": 0.9994768500328064, + "sampling/importance_sampling_ratio/min": 0.2286689728498459, + "sampling/sampling_logp_difference/max": 1.4754798412322998, + "sampling/sampling_logp_difference/mean": 0.01796390675008297, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 173.484375, + "completions/mean_terminated_length": 173.484375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2853643000125885, + "epoch": 0.8676470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07137457226982497, + "kl": 0.11410938203334808, + "learning_rate": 8.959708954444708e-07, + "loss": 0.0011, + "num_tokens": 22369357.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996079802513123, + "sampling/importance_sampling_ratio/min": 0.5437241792678833, + "sampling/sampling_logp_difference/max": 1.5145487785339355, + "sampling/sampling_logp_difference/mean": 0.017212729901075363, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 149.46875, + "completions/mean_terminated_length": 149.46875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3079431653022766, + "epoch": 0.8688725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4540088112938152, + "kl": 0.1305399388074875, + "learning_rate": 8.955355173281707e-07, + "loss": 0.0104, + "num_tokens": 22394171.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6495933532714844, + "sampling/importance_sampling_ratio/mean": 0.9999434351921082, + "sampling/importance_sampling_ratio/min": 0.5773681402206421, + "sampling/sampling_logp_difference/max": 0.549275279045105, + "sampling/sampling_logp_difference/mean": 0.016211165115237236, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 177.015625, + "completions/mean_terminated_length": 177.015625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2947250008583069, + "epoch": 0.8700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3316971512390994, + "kl": 0.11985829472541809, + "learning_rate": 8.95099336299828e-07, + "loss": 0.0029, + "num_tokens": 22424316.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6344491243362427, + "sampling/importance_sampling_ratio/mean": 0.999924898147583, + "sampling/importance_sampling_ratio/min": 0.5699969530105591, + "sampling/sampling_logp_difference/max": 0.5621242523193359, + "sampling/sampling_logp_difference/mean": 0.017383884638547897, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 180.46875, + "completions/mean_terminated_length": 180.46875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.27492639422416687, + "epoch": 0.8713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05769349472093441, + "kl": 0.12104004621505737, + "learning_rate": 8.946623532448631e-07, + "loss": 0.0011, + "num_tokens": 22454314.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5753906965255737, + "sampling/importance_sampling_ratio/mean": 0.9996606111526489, + "sampling/importance_sampling_ratio/min": 0.5292960405349731, + "sampling/sampling_logp_difference/max": 0.6362073421478271, + "sampling/sampling_logp_difference/mean": 0.014977009035646915, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 152.40625, + "completions/mean_terminated_length": 152.40625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.2546907663345337, + "epoch": 0.8725490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5431543846984839, + "kl": 0.1233743354678154, + "learning_rate": 8.942245690503238e-07, + "loss": 0.012, + "num_tokens": 22479860.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5062928199768066, + "sampling/importance_sampling_ratio/mean": 0.9999198913574219, + "sampling/importance_sampling_ratio/min": 0.6306304335594177, + "sampling/sampling_logp_difference/max": 0.46103525161743164, + "sampling/sampling_logp_difference/mean": 0.012808259576559067, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 179.40625, + "completions/mean_terminated_length": 179.40625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.21484437584877014, + "epoch": 0.8737745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0898677002345635, + "kl": 0.10381156206130981, + "learning_rate": 8.937859846048842e-07, + "loss": 0.001, + "num_tokens": 22507998.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5619345903396606, + "sampling/importance_sampling_ratio/mean": 0.9998908042907715, + "sampling/importance_sampling_ratio/min": 0.607501208782196, + "sampling/sampling_logp_difference/max": 0.4984011650085449, + "sampling/sampling_logp_difference/mean": 0.012432791292667389, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 224.734375, + "completions/mean_terminated_length": 224.734375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3591345250606537, + "epoch": 0.875, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.438804498440999, + "kl": 0.12299526482820511, + "learning_rate": 8.933466007988429e-07, + "loss": 0.0304, + "num_tokens": 22539133.0, + "reward": -0.1875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5407992601394653, + "sampling/importance_sampling_ratio/mean": 0.999602735042572, + "sampling/importance_sampling_ratio/min": 0.5887510776519775, + "sampling/sampling_logp_difference/max": 0.5297517776489258, + "sampling/sampling_logp_difference/mean": 0.016061928123235703, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 131.34375, + "completions/mean_terminated_length": 131.34375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.21670114994049072, + "epoch": 0.8762254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0910943446591457, + "kl": 0.10475843399763107, + "learning_rate": 8.929064185241212e-07, + "loss": 0.0011, + "num_tokens": 22559667.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.798926591873169, + "sampling/importance_sampling_ratio/mean": 1.0005546808242798, + "sampling/importance_sampling_ratio/min": 0.4834917485713959, + "sampling/sampling_logp_difference/max": 0.7267210483551025, + "sampling/sampling_logp_difference/mean": 0.01417156495153904, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 205.71875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.24957765638828278, + "epoch": 0.8774509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1269812297058832, + "kl": 0.08890828490257263, + "learning_rate": 8.924654386742611e-07, + "loss": 0.0595, + "num_tokens": 22589553.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5645136833190918, + "sampling/importance_sampling_ratio/mean": 1.0001609325408936, + "sampling/importance_sampling_ratio/min": 0.4951017498970032, + "sampling/sampling_logp_difference/max": 0.7029919624328613, + "sampling/sampling_logp_difference/mean": 0.01214967854321003, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 158.46875, + "completions/mean_terminated_length": 158.46875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.26717567443847656, + "epoch": 0.8786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06135570342970846, + "kl": 0.11892708390951157, + "learning_rate": 8.920236621444242e-07, + "loss": 0.0012, + "num_tokens": 22617359.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6179018020629883, + "sampling/importance_sampling_ratio/mean": 1.000462532043457, + "sampling/importance_sampling_ratio/min": 0.6216425895690918, + "sampling/sampling_logp_difference/max": 0.48113012313842773, + "sampling/sampling_logp_difference/mean": 0.01484605297446251, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 133.203125, + "completions/mean_terminated_length": 133.203125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.26337453722953796, + "epoch": 0.8799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3501941892853542, + "kl": 0.1662946194410324, + "learning_rate": 8.915810898313884e-07, + "loss": -0.0012, + "num_tokens": 22645692.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.8564430475234985, + "sampling/importance_sampling_ratio/mean": 1.0003392696380615, + "sampling/importance_sampling_ratio/min": 0.5689576268196106, + "sampling/sampling_logp_difference/max": 0.6186623573303223, + "sampling/sampling_logp_difference/mean": 0.017064658924937248, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 163.921875, + "completions/mean_terminated_length": 163.921875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.28888508677482605, + "epoch": 0.8811274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.392013915287687, + "kl": 0.1110411211848259, + "learning_rate": 8.911377226335478e-07, + "loss": -0.0236, + "num_tokens": 22678407.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000640153884888, + "sampling/importance_sampling_ratio/min": 0.3980575203895569, + "sampling/sampling_logp_difference/max": 0.9211587905883789, + "sampling/sampling_logp_difference/mean": 0.016323750838637352, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 191.9375, + "completions/mean_terminated_length": 191.9375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2733198404312134, + "epoch": 0.8823529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.826168728400338, + "kl": 0.13752731680870056, + "learning_rate": 8.906935614509095e-07, + "loss": 0.0313, + "num_tokens": 22706403.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5975159406661987, + "sampling/importance_sampling_ratio/mean": 1.0002658367156982, + "sampling/importance_sampling_ratio/min": 0.6048009991645813, + "sampling/sampling_logp_difference/max": 0.5028557777404785, + "sampling/sampling_logp_difference/mean": 0.01455213874578476, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 164.828125, + "completions/mean_terminated_length": 164.828125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4618142247200012, + "epoch": 0.883578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8854194347941413, + "kl": 0.21749398112297058, + "learning_rate": 8.902486071850926e-07, + "loss": 0.0349, + "num_tokens": 22739864.0, + "reward": -0.09375, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.8903597593307495, + "sampling/importance_sampling_ratio/mean": 0.9992150664329529, + "sampling/importance_sampling_ratio/min": 0.526161253452301, + "sampling/sampling_logp_difference/max": 0.6421475410461426, + "sampling/sampling_logp_difference/mean": 0.021756840869784355, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 138.4375, + "completions/mean_terminated_length": 138.4375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3331761360168457, + "epoch": 0.8848039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.610386235911436, + "kl": 0.15505921840667725, + "learning_rate": 8.89802860739326e-07, + "loss": 0.0143, + "num_tokens": 22769428.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009257793426514, + "sampling/importance_sampling_ratio/min": 0.5311965346336365, + "sampling/sampling_logp_difference/max": 0.7143645286560059, + "sampling/sampling_logp_difference/mean": 0.018353819847106934, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 171.828125, + "completions/mean_terminated_length": 171.828125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.35276901721954346, + "epoch": 0.8860294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2634663462798594, + "kl": 0.12248918414115906, + "learning_rate": 8.89356323018447e-07, + "loss": -0.0115, + "num_tokens": 22799625.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6889691352844238, + "sampling/importance_sampling_ratio/mean": 1.000432014465332, + "sampling/importance_sampling_ratio/min": 0.46902188658714294, + "sampling/sampling_logp_difference/max": 0.757105827331543, + "sampling/sampling_logp_difference/mean": 0.01768193021416664, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 124.453125, + "completions/mean_terminated_length": 124.453125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3187045753002167, + "epoch": 0.8872549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.6120365143953608, + "kl": 0.2303055375814438, + "learning_rate": 8.889089949288986e-07, + "loss": -0.019, + "num_tokens": 22821702.0, + "reward": 0.6875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.730854868888855, + "sampling/importance_sampling_ratio/mean": 1.000281810760498, + "sampling/importance_sampling_ratio/min": 0.6541826725006104, + "sampling/sampling_logp_difference/max": 0.5486154556274414, + "sampling/sampling_logp_difference/mean": 0.018116045743227005, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 132.796875, + "completions/mean_terminated_length": 132.796875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.18036410212516785, + "epoch": 0.8884803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1824288888284623, + "kl": 0.10626198351383209, + "learning_rate": 8.884608773787288e-07, + "loss": 0.001, + "num_tokens": 22843641.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4594168663024902, + "sampling/importance_sampling_ratio/mean": 1.0004578828811646, + "sampling/importance_sampling_ratio/min": 0.49541574716567993, + "sampling/sampling_logp_difference/max": 0.7023580074310303, + "sampling/sampling_logp_difference/mean": 0.012088064104318619, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 172.65625, + "completions/mean_terminated_length": 172.65625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.31260034441947937, + "epoch": 0.8897058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8754439548088546, + "kl": 0.1252618134021759, + "learning_rate": 8.880119712775875e-07, + "loss": 0.0219, + "num_tokens": 22873123.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.595409870147705, + "sampling/importance_sampling_ratio/mean": 1.0009194612503052, + "sampling/importance_sampling_ratio/min": 0.5038020610809326, + "sampling/sampling_logp_difference/max": 0.6855719089508057, + "sampling/sampling_logp_difference/mean": 0.015545779839158058, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 176.46875, + "completions/mean_terminated_length": 176.46875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.35207128524780273, + "epoch": 0.8909313725490197, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2894315350001015, + "kl": 0.14700110256671906, + "learning_rate": 8.875622775367259e-07, + "loss": 0.0803, + "num_tokens": 22900337.0, + "reward": 0.84375, + "reward_std": 0.46656501293182373, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6219651699066162, + "sampling/importance_sampling_ratio/mean": 1.0002046823501587, + "sampling/importance_sampling_ratio/min": 0.3306296169757843, + "sampling/sampling_logp_difference/max": 1.106756567955017, + "sampling/sampling_logp_difference/mean": 0.017628487199544907, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.442595899105072, + "epoch": 0.8921568627450981, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9493746234582252, + "kl": 0.12233205139636993, + "learning_rate": 8.871117970689937e-07, + "loss": 0.0071, + "num_tokens": 22934025.0, + "reward": 0.03125, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995599985122681, + "sampling/importance_sampling_ratio/min": 0.22002194821834564, + "sampling/sampling_logp_difference/max": 1.5140279531478882, + "sampling/sampling_logp_difference/mean": 0.020226042717695236, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 167.8125, + "completions/mean_terminated_length": 167.8125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.273113489151001, + "epoch": 0.8933823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.431421698045663, + "kl": 0.12466256320476532, + "learning_rate": 8.866605307888376e-07, + "loss": 0.032, + "num_tokens": 22961757.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6836470365524292, + "sampling/importance_sampling_ratio/mean": 1.0004937648773193, + "sampling/importance_sampling_ratio/min": 0.49846717715263367, + "sampling/sampling_logp_difference/max": 0.6962175369262695, + "sampling/sampling_logp_difference/mean": 0.014166696928441525, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 171.328125, + "completions/mean_terminated_length": 171.328125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3084203004837036, + "epoch": 0.8946078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1807505881028106, + "kl": 0.1305488795042038, + "learning_rate": 8.862084796122997e-07, + "loss": 0.0412, + "num_tokens": 22991938.0, + "reward": 0.21875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.775182843208313, + "sampling/importance_sampling_ratio/mean": 1.000341773033142, + "sampling/importance_sampling_ratio/min": 0.512789785861969, + "sampling/sampling_logp_difference/max": 0.6678893566131592, + "sampling/sampling_logp_difference/mean": 0.017210688441991806, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 133.984375, + "completions/mean_terminated_length": 133.984375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3547746539115906, + "epoch": 0.8958333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8947466005816809, + "kl": 0.17236892879009247, + "learning_rate": 8.857556444570153e-07, + "loss": -0.0089, + "num_tokens": 23018289.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993709921836853, + "sampling/importance_sampling_ratio/min": 0.5687522292137146, + "sampling/sampling_logp_difference/max": 0.7245337963104248, + "sampling/sampling_logp_difference/mean": 0.016643021255731583, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 160.734375, + "completions/mean_terminated_length": 160.734375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3138841986656189, + "epoch": 0.8970588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.344781421912304, + "kl": 0.11245203018188477, + "learning_rate": 8.853020262422109e-07, + "loss": -0.0154, + "num_tokens": 23042752.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7926790714263916, + "sampling/importance_sampling_ratio/mean": 0.9999420642852783, + "sampling/importance_sampling_ratio/min": 0.6133373975753784, + "sampling/sampling_logp_difference/max": 0.5837111473083496, + "sampling/sampling_logp_difference/mean": 0.015560884959995747, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 190.921875, + "completions/mean_terminated_length": 190.921875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2579788565635681, + "epoch": 0.8982843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.150261385713446, + "kl": 0.10539811849594116, + "learning_rate": 8.84847625888703e-07, + "loss": 0.0217, + "num_tokens": 23076363.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.621700406074524, + "sampling/importance_sampling_ratio/mean": 0.999610424041748, + "sampling/importance_sampling_ratio/min": 0.6068662405014038, + "sampling/sampling_logp_difference/max": 0.4994468688964844, + "sampling/sampling_logp_difference/mean": 0.012584343552589417, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 133.25, + "completions/mean_terminated_length": 133.25, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.3112919330596924, + "epoch": 0.8995098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6945912421821459, + "kl": 0.16028791666030884, + "learning_rate": 8.843924443188953e-07, + "loss": -0.0124, + "num_tokens": 23104299.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.8255785703659058, + "sampling/importance_sampling_ratio/mean": 0.9997628927230835, + "sampling/importance_sampling_ratio/min": 0.39045611023902893, + "sampling/sampling_logp_difference/max": 0.9404397010803223, + "sampling/sampling_logp_difference/mean": 0.017352301627397537, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 166.328125, + "completions/mean_terminated_length": 166.328125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3333016633987427, + "epoch": 0.9007352941176471, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.136442992868966, + "kl": 0.19512274861335754, + "learning_rate": 8.839364824567775e-07, + "loss": -0.0368, + "num_tokens": 23131616.0, + "reward": 0.28125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.7447431087493896, + "sampling/importance_sampling_ratio/mean": 0.9993703365325928, + "sampling/importance_sampling_ratio/min": 0.5949843525886536, + "sampling/sampling_logp_difference/max": 0.5566072463989258, + "sampling/sampling_logp_difference/mean": 0.01808866485953331, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 239.25, + "completions/mean_terminated_length": 239.25, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.39610356092453003, + "epoch": 0.9019607843137255, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.933601120837525, + "kl": 0.13343805074691772, + "learning_rate": 8.834797412279235e-07, + "loss": -0.0563, + "num_tokens": 23167712.0, + "reward": 0.4375, + "reward_std": 0.5738953948020935, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9925565719604492, + "sampling/importance_sampling_ratio/mean": 1.0004668235778809, + "sampling/importance_sampling_ratio/min": 0.4509410262107849, + "sampling/sampling_logp_difference/max": 0.7964186668395996, + "sampling/sampling_logp_difference/mean": 0.01696062460541725, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 131.859375, + "completions/mean_terminated_length": 131.859375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3019437789916992, + "epoch": 0.9031862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5423557238277512, + "kl": 0.19669732451438904, + "learning_rate": 8.83022221559489e-07, + "loss": -0.0215, + "num_tokens": 23190471.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4498867988586426, + "sampling/importance_sampling_ratio/mean": 1.0004997253417969, + "sampling/importance_sampling_ratio/min": 0.6259177327156067, + "sampling/sampling_logp_difference/max": 0.468536376953125, + "sampling/sampling_logp_difference/mean": 0.015907293185591698, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 145.484375, + "completions/mean_terminated_length": 145.484375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.30380311608314514, + "epoch": 0.9044117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.355611611588332, + "kl": 0.14590933918952942, + "learning_rate": 8.825639243802098e-07, + "loss": 0.0064, + "num_tokens": 23222038.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.964614987373352, + "sampling/importance_sampling_ratio/mean": 1.0001753568649292, + "sampling/importance_sampling_ratio/min": 0.6057144403457642, + "sampling/sampling_logp_difference/max": 0.6752963066101074, + "sampling/sampling_logp_difference/mean": 0.01644541323184967, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 174.5, + "completions/mean_terminated_length": 174.5, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2797512412071228, + "epoch": 0.9056372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4076118802180047, + "kl": 0.10469253361225128, + "learning_rate": 8.821048506204005e-07, + "loss": -0.063, + "num_tokens": 23247302.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5281046628952026, + "sampling/importance_sampling_ratio/mean": 0.9992679357528687, + "sampling/importance_sampling_ratio/min": 0.48816025257110596, + "sampling/sampling_logp_difference/max": 0.7171115875244141, + "sampling/sampling_logp_difference/mean": 0.013918949291110039, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 202.890625, + "completions/mean_terminated_length": 202.890625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.38396427035331726, + "epoch": 0.9068627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2545918080625817, + "kl": 0.142167866230011, + "learning_rate": 8.816450012119513e-07, + "loss": 0.0014, + "num_tokens": 23283855.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002930164337158, + "sampling/importance_sampling_ratio/min": 0.6144245266914368, + "sampling/sampling_logp_difference/max": 0.7409529685974121, + "sampling/sampling_logp_difference/mean": 0.01684897020459175, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 168.515625, + "completions/mean_terminated_length": 168.515625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.288840115070343, + "epoch": 0.9080882352941176, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9517173861653833, + "kl": 0.11101772636175156, + "learning_rate": 8.811843770883276e-07, + "loss": -0.0023, + "num_tokens": 23312512.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265670776367, + "sampling/importance_sampling_ratio/min": 0.4923335611820221, + "sampling/sampling_logp_difference/max": 0.7541918754577637, + "sampling/sampling_logp_difference/mean": 0.014561197720468044, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 250.140625, + "completions/mean_terminated_length": 250.140625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.42110323905944824, + "epoch": 0.9093137254901961, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.944178334035574, + "kl": 0.11443128436803818, + "learning_rate": 8.807229791845671e-07, + "loss": -0.0144, + "num_tokens": 23348249.0, + "reward": -0.03125, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.815877079963684, + "sampling/importance_sampling_ratio/mean": 1.000603437423706, + "sampling/importance_sampling_ratio/min": 0.5373468995094299, + "sampling/sampling_logp_difference/max": 0.6211113929748535, + "sampling/sampling_logp_difference/mean": 0.017753958702087402, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 197.09375, + "completions/mean_terminated_length": 197.09375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2682134509086609, + "epoch": 0.9105392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3822187220239395, + "kl": 0.1007651761174202, + "learning_rate": 8.802608084372785e-07, + "loss": 0.0442, + "num_tokens": 23382143.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6189647912979126, + "sampling/importance_sampling_ratio/mean": 0.999918520450592, + "sampling/importance_sampling_ratio/min": 0.5457268357276917, + "sampling/sampling_logp_difference/max": 0.6056368350982666, + "sampling/sampling_logp_difference/mean": 0.013500851579010487, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 224.984375, + "completions/mean_terminated_length": 224.984375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3589177429676056, + "epoch": 0.9117647058823529, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9843702764558497, + "kl": 0.11899229139089584, + "learning_rate": 8.79797865784639e-07, + "loss": -0.0247, + "num_tokens": 23413582.0, + "reward": 0.03125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.762001872062683, + "sampling/importance_sampling_ratio/mean": 1.0001921653747559, + "sampling/importance_sampling_ratio/min": 0.6599255204200745, + "sampling/sampling_logp_difference/max": 0.5664505958557129, + "sampling/sampling_logp_difference/mean": 0.015161126852035522, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 214.921875, + "completions/mean_terminated_length": 214.921875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3481399714946747, + "epoch": 0.9129901960784313, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.37079857009415, + "kl": 0.12129916250705719, + "learning_rate": 8.793341521663928e-07, + "loss": 0.0648, + "num_tokens": 23446041.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5652804374694824, + "sampling/importance_sampling_ratio/mean": 1.000514030456543, + "sampling/importance_sampling_ratio/min": 0.5542005300521851, + "sampling/sampling_logp_difference/max": 0.5902286767959595, + "sampling/sampling_logp_difference/mean": 0.014764299616217613, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 179.90625, + "completions/mean_terminated_length": 179.90625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.26453089714050293, + "epoch": 0.9142156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08713475356444839, + "kl": 0.1045544445514679, + "learning_rate": 8.788696685238494e-07, + "loss": 0.001, + "num_tokens": 23475891.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.577684998512268, + "sampling/importance_sampling_ratio/mean": 0.9993813037872314, + "sampling/importance_sampling_ratio/min": 0.5581960082054138, + "sampling/sampling_logp_difference/max": 0.5830450057983398, + "sampling/sampling_logp_difference/mean": 0.014570602215826511, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 188.265625, + "completions/mean_terminated_length": 188.265625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.24562190473079681, + "epoch": 0.9154411764705882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05687146209703193, + "kl": 0.10551153123378754, + "learning_rate": 8.784044157998809e-07, + "loss": 0.001, + "num_tokens": 23502484.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4801042079925537, + "sampling/importance_sampling_ratio/mean": 0.9999704360961914, + "sampling/importance_sampling_ratio/min": 0.5629025101661682, + "sampling/sampling_logp_difference/max": 0.5746488571166992, + "sampling/sampling_logp_difference/mean": 0.012423046864569187, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 236.375, + "completions/mean_terminated_length": 236.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4058482050895691, + "epoch": 0.9166666666666666, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.652487366830316, + "kl": 0.14283131062984467, + "learning_rate": 8.779383949389208e-07, + "loss": -0.0465, + "num_tokens": 23536764.0, + "reward": 0.09375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.9442535638809204, + "sampling/importance_sampling_ratio/mean": 1.0002634525299072, + "sampling/importance_sampling_ratio/min": 0.03916797414422035, + "sampling/sampling_logp_difference/max": 3.239895820617676, + "sampling/sampling_logp_difference/mean": 0.018831366673111916, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 146.3125, + "completions/mean_terminated_length": 146.3125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.23472276329994202, + "epoch": 0.9178921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.601902455625568, + "kl": 0.10527126491069794, + "learning_rate": 8.774716068869623e-07, + "loss": -0.013, + "num_tokens": 23562448.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4908369779586792, + "sampling/importance_sampling_ratio/mean": 0.9994587302207947, + "sampling/importance_sampling_ratio/min": 0.5684351921081543, + "sampling/sampling_logp_difference/max": 0.5648679733276367, + "sampling/sampling_logp_difference/mean": 0.013005070388317108, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.35371267795562744, + "epoch": 0.9191176470588235, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.912311800476807, + "kl": 0.06903313100337982, + "learning_rate": 8.770040525915553e-07, + "loss": 0.0118, + "num_tokens": 23610000.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8112993240356445, + "sampling/importance_sampling_ratio/mean": 1.0004202127456665, + "sampling/importance_sampling_ratio/min": 0.5983051061630249, + "sampling/sampling_logp_difference/max": 0.5940444469451904, + "sampling/sampling_logp_difference/mean": 0.015579350292682648, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 188.421875, + "completions/mean_terminated_length": 188.421875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.29587602615356445, + "epoch": 0.9203431372549019, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1386399902225393, + "kl": 0.1196604073047638, + "learning_rate": 8.765357330018055e-07, + "loss": -0.0425, + "num_tokens": 23638843.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5930376052856445, + "sampling/importance_sampling_ratio/mean": 1.0000560283660889, + "sampling/importance_sampling_ratio/min": 0.552837610244751, + "sampling/sampling_logp_difference/max": 0.5926909446716309, + "sampling/sampling_logp_difference/mean": 0.014708933420479298, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 245.5625, + "completions/mean_terminated_length": 245.5625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2548101842403412, + "epoch": 0.9215686274509803, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8398398133402648, + "kl": 0.13636735081672668, + "learning_rate": 8.760666490683719e-07, + "loss": -0.0048, + "num_tokens": 23671215.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6207329034805298, + "sampling/importance_sampling_ratio/mean": 0.9996460676193237, + "sampling/importance_sampling_ratio/min": 0.4399203658103943, + "sampling/sampling_logp_difference/max": 0.8211615085601807, + "sampling/sampling_logp_difference/mean": 0.013017473742365837, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 193.875, + "completions/mean_terminated_length": 193.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3652108907699585, + "epoch": 0.9227941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1424372731621233, + "kl": 0.1297123283147812, + "learning_rate": 8.755968017434651e-07, + "loss": -0.0019, + "num_tokens": 23700487.0, + "reward": -0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5887606143951416, + "sampling/importance_sampling_ratio/mean": 1.0001184940338135, + "sampling/importance_sampling_ratio/min": 0.6151662468910217, + "sampling/sampling_logp_difference/max": 0.48586273193359375, + "sampling/sampling_logp_difference/mean": 0.018013084307312965, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 210.25, + "completions/mean_terminated_length": 210.25, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.27533072233200073, + "epoch": 0.9240196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047112638402082026, + "kl": 0.08654429018497467, + "learning_rate": 8.751261919808457e-07, + "loss": 0.0009, + "num_tokens": 23734519.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7003505229949951, + "sampling/importance_sampling_ratio/mean": 1.0001380443572998, + "sampling/importance_sampling_ratio/min": 0.39048299193382263, + "sampling/sampling_logp_difference/max": 0.9403707981109619, + "sampling/sampling_logp_difference/mean": 0.015848493203520775, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 266.75, + "completions/mean_terminated_length": 266.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.20762838423252106, + "epoch": 0.9252450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8807952593732118, + "kl": 0.08133986592292786, + "learning_rate": 8.746548207358215e-07, + "loss": 0.0097, + "num_tokens": 23774951.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000663995742798, + "sampling/importance_sampling_ratio/min": 0.5408735275268555, + "sampling/sampling_logp_difference/max": 1.1860003471374512, + "sampling/sampling_logp_difference/mean": 0.011015929281711578, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 258.359375, + "completions/mean_terminated_length": 258.359375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.40661847591400146, + "epoch": 0.9264705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3951694354347108, + "kl": 0.13240307569503784, + "learning_rate": 8.741826889652463e-07, + "loss": -0.0076, + "num_tokens": 23814590.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4788700342178345, + "sampling/importance_sampling_ratio/mean": 0.9997360110282898, + "sampling/importance_sampling_ratio/min": 0.3830121159553528, + "sampling/sampling_logp_difference/max": 0.9596887230873108, + "sampling/sampling_logp_difference/mean": 0.0171474888920784, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 258.71875, + "completions/mean_terminated_length": 258.71875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.2582932710647583, + "epoch": 0.9276960784313726, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5198818757485997, + "kl": 0.11752104014158249, + "learning_rate": 8.737097976275176e-07, + "loss": 0.0292, + "num_tokens": 23846860.0, + "reward": 0.84375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6279054880142212, + "sampling/importance_sampling_ratio/mean": 1.0000791549682617, + "sampling/importance_sampling_ratio/min": 0.6158292293548584, + "sampling/sampling_logp_difference/max": 0.48729419708251953, + "sampling/sampling_logp_difference/mean": 0.012493856251239777, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 264.46875, + "completions/mean_terminated_length": 264.46875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.34261465072631836, + "epoch": 0.928921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1119301867776166, + "kl": 0.092156782746315, + "learning_rate": 8.73236147682575e-07, + "loss": 0.0132, + "num_tokens": 23889082.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004080533981323, + "sampling/importance_sampling_ratio/min": 0.43092551827430725, + "sampling/sampling_logp_difference/max": 0.9247453212738037, + "sampling/sampling_logp_difference/mean": 0.01624486595392227, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 276.203125, + "completions/mean_terminated_length": 276.203125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.223912313580513, + "epoch": 0.9301470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3035182604405826, + "kl": 0.07586102187633514, + "learning_rate": 8.727617400918978e-07, + "loss": -0.0102, + "num_tokens": 23927271.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001288652420044, + "sampling/importance_sampling_ratio/min": 0.3184730112552643, + "sampling/sampling_logp_difference/max": 1.144217610359192, + "sampling/sampling_logp_difference/mean": 0.011804968118667603, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 348.734375, + "completions/mean_terminated_length": 348.734375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.30830830335617065, + "epoch": 0.9313725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052828823043434545, + "kl": 0.0683603584766388, + "learning_rate": 8.722865758185035e-07, + "loss": 0.0006, + "num_tokens": 23967398.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000340938568115, + "sampling/importance_sampling_ratio/min": 0.2917536795139313, + "sampling/sampling_logp_difference/max": 1.5527136325836182, + "sampling/sampling_logp_difference/mean": 0.014943273738026619, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 348.96875, + "completions/mean_terminated_length": 348.96875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.32950258255004883, + "epoch": 0.9325980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0608405048500464, + "kl": 0.09067786484956741, + "learning_rate": 8.718106558269452e-07, + "loss": 0.0174, + "num_tokens": 24008692.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998042583465576, + "sampling/importance_sampling_ratio/min": 0.09122932702302933, + "sampling/sampling_logp_difference/max": 2.394378900527954, + "sampling/sampling_logp_difference/mean": 0.014266250655055046, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.2407093495130539, + "epoch": 0.9338235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.049874914237902, + "kl": 0.07820361852645874, + "learning_rate": 8.713339810833105e-07, + "loss": 0.0515, + "num_tokens": 24033780.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007174015045166, + "sampling/importance_sampling_ratio/min": 0.571811854839325, + "sampling/sampling_logp_difference/max": 0.7264003753662109, + "sampling/sampling_logp_difference/mean": 0.014713791199028492, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 275.828125, + "completions/mean_terminated_length": 275.828125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.2686575651168823, + "epoch": 0.9350490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3854418319610282, + "kl": 0.09390707314014435, + "learning_rate": 8.708565525552189e-07, + "loss": -0.0016, + "num_tokens": 24069913.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6036155223846436, + "sampling/importance_sampling_ratio/mean": 0.9999788403511047, + "sampling/importance_sampling_ratio/min": 0.18501514196395874, + "sampling/sampling_logp_difference/max": 1.6873176097869873, + "sampling/sampling_logp_difference/mean": 0.014508314430713654, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 317.296875, + "completions/mean_terminated_length": 317.296875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.328898549079895, + "epoch": 0.9362745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.948734908999543, + "kl": 0.0919274240732193, + "learning_rate": 8.703783712118202e-07, + "loss": -0.008, + "num_tokens": 24113628.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006535053253174, + "sampling/importance_sampling_ratio/min": 0.2388659417629242, + "sampling/sampling_logp_difference/max": 1.4318528175354004, + "sampling/sampling_logp_difference/mean": 0.017194965854287148, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 261.609375, + "completions/mean_terminated_length": 261.609375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.19261127710342407, + "epoch": 0.9375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05829228337489209, + "kl": 0.06861470639705658, + "learning_rate": 8.69899438023792e-07, + "loss": 0.0006, + "num_tokens": 24146531.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999251365661621, + "sampling/importance_sampling_ratio/min": 0.5260617733001709, + "sampling/sampling_logp_difference/max": 0.9457783699035645, + "sampling/sampling_logp_difference/mean": 0.011656357906758785, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 400.125, + "completions/mean_terminated_length": 400.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.2249448448419571, + "epoch": 0.9387254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5009311419991561, + "kl": 0.06286576390266418, + "learning_rate": 8.694197539633385e-07, + "loss": -0.0244, + "num_tokens": 24192123.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9170788526535034, + "sampling/importance_sampling_ratio/mean": 0.9997783899307251, + "sampling/importance_sampling_ratio/min": 0.4161491394042969, + "sampling/sampling_logp_difference/max": 0.8767116069793701, + "sampling/sampling_logp_difference/mean": 0.010771805420517921, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 228.640625, + "completions/mean_terminated_length": 228.640625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.32457515597343445, + "epoch": 0.9399509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4842880579057736, + "kl": 0.10564848780632019, + "learning_rate": 8.689393200041878e-07, + "loss": 0.0219, + "num_tokens": 24226564.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.689996361732483, + "sampling/importance_sampling_ratio/mean": 1.0003066062927246, + "sampling/importance_sampling_ratio/min": 0.4712963104248047, + "sampling/sampling_logp_difference/max": 0.7522683143615723, + "sampling/sampling_logp_difference/mean": 0.016728565096855164, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 326.328125, + "completions/mean_terminated_length": 326.328125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.3114994466304779, + "epoch": 0.9411764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8765508643097657, + "kl": 0.08524590730667114, + "learning_rate": 8.684581371215904e-07, + "loss": -0.0087, + "num_tokens": 24271401.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.980911374092102, + "sampling/importance_sampling_ratio/mean": 0.9998607039451599, + "sampling/importance_sampling_ratio/min": 0.28986427187919617, + "sampling/sampling_logp_difference/max": 1.238342523574829, + "sampling/sampling_logp_difference/mean": 0.01525900699198246, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 206.6875, + "completions/mean_terminated_length": 206.6875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.26448220014572144, + "epoch": 0.9424019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07077786446123661, + "kl": 0.09754940867424011, + "learning_rate": 8.679762062923175e-07, + "loss": 0.001, + "num_tokens": 24300901.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993366599082947, + "sampling/importance_sampling_ratio/min": 0.5680510997772217, + "sampling/sampling_logp_difference/max": 0.8461496829986572, + "sampling/sampling_logp_difference/mean": 0.014535041525959969, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 275.359375, + "completions/mean_terminated_length": 275.359375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.1998107135295868, + "epoch": 0.9436274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04915317473340469, + "kl": 0.07948745787143707, + "learning_rate": 8.674935284946576e-07, + "loss": 0.0007, + "num_tokens": 24331724.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5431855916976929, + "sampling/importance_sampling_ratio/mean": 0.9998654723167419, + "sampling/importance_sampling_ratio/min": 0.6153616309165955, + "sampling/sampling_logp_difference/max": 0.48554515838623047, + "sampling/sampling_logp_difference/mean": 0.011757083237171173, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 233.375, + "completions/mean_terminated_length": 233.375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.18456797301769257, + "epoch": 0.9448529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057234599373731744, + "kl": 0.06149008870124817, + "learning_rate": 8.670101047084162e-07, + "loss": 0.0006, + "num_tokens": 24361972.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.759658694267273, + "sampling/importance_sampling_ratio/mean": 1.0008488893508911, + "sampling/importance_sampling_ratio/min": 0.5163919925689697, + "sampling/sampling_logp_difference/max": 0.6608891487121582, + "sampling/sampling_logp_difference/mean": 0.010988589376211166, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 213.125, + "completions/mean_terminated_length": 213.125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2671198844909668, + "epoch": 0.946078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1167158123492245, + "kl": 0.07398052513599396, + "learning_rate": 8.66525935914913e-07, + "loss": -0.0101, + "num_tokens": 24390348.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.642784595489502, + "sampling/importance_sampling_ratio/mean": 0.9997448921203613, + "sampling/importance_sampling_ratio/min": 0.6081196665763855, + "sampling/sampling_logp_difference/max": 0.49738359451293945, + "sampling/sampling_logp_difference/mean": 0.014165053144097328, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 243.6875, + "completions/mean_terminated_length": 243.6875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.24403822422027588, + "epoch": 0.9473039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.227076431898468, + "kl": 0.07924221456050873, + "learning_rate": 8.660410230969804e-07, + "loss": 0.0109, + "num_tokens": 24422008.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9937978982925415, + "sampling/importance_sampling_ratio/mean": 0.9996981620788574, + "sampling/importance_sampling_ratio/min": 0.5999762415885925, + "sampling/sampling_logp_difference/max": 0.6900413036346436, + "sampling/sampling_logp_difference/mean": 0.013857114128768444, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 213.59375, + "completions/mean_terminated_length": 213.59375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.23578113317489624, + "epoch": 0.9485294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0583291347233092, + "kl": 0.07998788356781006, + "learning_rate": 8.655553672389599e-07, + "loss": 0.0008, + "num_tokens": 24451822.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5902295112609863, + "sampling/importance_sampling_ratio/mean": 0.9997382164001465, + "sampling/importance_sampling_ratio/min": 0.4708743989467621, + "sampling/sampling_logp_difference/max": 0.7531639337539673, + "sampling/sampling_logp_difference/mean": 0.014234479516744614, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.27442288398742676, + "epoch": 0.9497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22505439861754856, + "kl": 0.11774364113807678, + "learning_rate": 8.650689693267026e-07, + "loss": 0.0012, + "num_tokens": 24486262.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995816946029663, + "sampling/importance_sampling_ratio/min": 0.26457977294921875, + "sampling/sampling_logp_difference/max": 1.3296124935150146, + "sampling/sampling_logp_difference/mean": 0.01688530668616295, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 226.0625, + "completions/mean_terminated_length": 226.0625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.25847139954566956, + "epoch": 0.9509803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1638385298942386, + "kl": 0.11118605732917786, + "learning_rate": 8.645818303475654e-07, + "loss": 0.0134, + "num_tokens": 24518186.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000941753387451, + "sampling/importance_sampling_ratio/min": 0.4972367584705353, + "sampling/sampling_logp_difference/max": 2.187211036682129, + "sampling/sampling_logp_difference/mean": 0.014578125439584255, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 222.609375, + "completions/mean_terminated_length": 222.609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.24409694969654083, + "epoch": 0.9522058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.687171589855556, + "kl": 0.09179659187793732, + "learning_rate": 8.640939512904095e-07, + "loss": -0.0488, + "num_tokens": 24552257.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7985886335372925, + "sampling/importance_sampling_ratio/mean": 0.9997256994247437, + "sampling/importance_sampling_ratio/min": 0.4481438100337982, + "sampling/sampling_logp_difference/max": 0.8026411533355713, + "sampling/sampling_logp_difference/mean": 0.014489128254354, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 186.90625, + "completions/mean_terminated_length": 186.90625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.22848191857337952, + "epoch": 0.9534313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1136037290768883, + "kl": 0.08895926177501678, + "learning_rate": 8.636053331455986e-07, + "loss": 0.0008, + "num_tokens": 24582123.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6115034818649292, + "sampling/importance_sampling_ratio/mean": 0.9996715784072876, + "sampling/importance_sampling_ratio/min": 0.273299902677536, + "sampling/sampling_logp_difference/max": 1.2971855401992798, + "sampling/sampling_logp_difference/mean": 0.01470221672207117, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 211.078125, + "completions/mean_terminated_length": 211.078125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.23104828596115112, + "epoch": 0.9546568627450981, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4125139299465415, + "kl": 0.06623545289039612, + "learning_rate": 8.631159769049964e-07, + "loss": -0.003, + "num_tokens": 24616768.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8640655279159546, + "sampling/importance_sampling_ratio/mean": 1.000211477279663, + "sampling/importance_sampling_ratio/min": 0.26969853043556213, + "sampling/sampling_logp_difference/max": 1.310450553894043, + "sampling/sampling_logp_difference/mean": 0.014722302556037903, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 198.875, + "completions/mean_terminated_length": 198.875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.294430136680603, + "epoch": 0.9558823529411765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09517564557086731, + "kl": 0.1045638769865036, + "learning_rate": 8.626258835619653e-07, + "loss": 0.0011, + "num_tokens": 24645704.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6792397499084473, + "sampling/importance_sampling_ratio/mean": 0.9997049570083618, + "sampling/importance_sampling_ratio/min": 0.2991315722465515, + "sampling/sampling_logp_difference/max": 1.206871747970581, + "sampling/sampling_logp_difference/mean": 0.016724448651075363, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 335.984375, + "completions/mean_terminated_length": 335.984375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.28677886724472046, + "epoch": 0.9571078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3262884170193108, + "kl": 0.07529214024543762, + "learning_rate": 8.621350541113636e-07, + "loss": 0.0445, + "num_tokens": 24685319.0, + "reward": 0.0625, + "reward_std": 0.5123475193977356, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7604070901870728, + "sampling/importance_sampling_ratio/mean": 0.9992750287055969, + "sampling/importance_sampling_ratio/min": 0.25306573510169983, + "sampling/sampling_logp_difference/max": 1.3741059303283691, + "sampling/sampling_logp_difference/mean": 0.014710287563502789, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.28004491329193115, + "epoch": 0.9583333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5659166278465684, + "kl": 0.07568037509918213, + "learning_rate": 8.616434895495439e-07, + "loss": 0.0382, + "num_tokens": 24710551.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5768544673919678, + "sampling/importance_sampling_ratio/mean": 0.9995042681694031, + "sampling/importance_sampling_ratio/min": 0.6098422408103943, + "sampling/sampling_logp_difference/max": 0.4945549964904785, + "sampling/sampling_logp_difference/mean": 0.014809216372668743, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 228.734375, + "completions/mean_terminated_length": 228.734375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.27852103114128113, + "epoch": 0.9595588235294118, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7220481070785896, + "kl": 0.08494658768177032, + "learning_rate": 8.611511908743514e-07, + "loss": -0.0265, + "num_tokens": 24739846.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.607839584350586, + "sampling/importance_sampling_ratio/mean": 0.9999562501907349, + "sampling/importance_sampling_ratio/min": 0.4443697929382324, + "sampling/sampling_logp_difference/max": 0.8110982179641724, + "sampling/sampling_logp_difference/mean": 0.014362575486302376, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 194.84375, + "completions/mean_terminated_length": 194.84375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.22808068990707397, + "epoch": 0.9607843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3911844185995597, + "kl": 0.16934823989868164, + "learning_rate": 8.606581590851208e-07, + "loss": 0.011, + "num_tokens": 24767356.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005059242248535, + "sampling/importance_sampling_ratio/min": 0.44347065687179565, + "sampling/sampling_logp_difference/max": 0.9142541885375977, + "sampling/sampling_logp_difference/mean": 0.013863971456885338, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 196.09375, + "completions/mean_terminated_length": 196.09375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.26508060097694397, + "epoch": 0.9620098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4015269869062008, + "kl": 0.09637516736984253, + "learning_rate": 8.601643951826758e-07, + "loss": -0.0127, + "num_tokens": 24799634.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003741979599, + "sampling/importance_sampling_ratio/min": 0.3292045593261719, + "sampling/sampling_logp_difference/max": 1.2245583534240723, + "sampling/sampling_logp_difference/mean": 0.015394306741654873, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 173.09375, + "completions/mean_terminated_length": 173.09375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2639704942703247, + "epoch": 0.9632352941176471, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4386105462370795, + "kl": 0.09897458553314209, + "learning_rate": 8.596699001693255e-07, + "loss": 0.0098, + "num_tokens": 24826968.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000314712524414, + "sampling/importance_sampling_ratio/min": 0.5619944930076599, + "sampling/sampling_logp_difference/max": 0.7084627151489258, + "sampling/sampling_logp_difference/mean": 0.01590142212808132, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 212.921875, + "completions/mean_terminated_length": 212.921875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.24254243075847626, + "epoch": 0.9644607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05603446342313128, + "kl": 0.10858361423015594, + "learning_rate": 8.591746750488637e-07, + "loss": 0.0009, + "num_tokens": 24860307.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6227291822433472, + "sampling/importance_sampling_ratio/mean": 0.9999048113822937, + "sampling/importance_sampling_ratio/min": 0.5034974217414856, + "sampling/sampling_logp_difference/max": 0.6861767768859863, + "sampling/sampling_logp_difference/mean": 0.014957180246710777, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 172.734375, + "completions/mean_terminated_length": 172.734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.23348455131053925, + "epoch": 0.9656862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.5485027589058595, + "kl": 0.09483473002910614, + "learning_rate": 8.58678720826566e-07, + "loss": 0.0051, + "num_tokens": 24888482.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000526905059814, + "sampling/importance_sampling_ratio/min": 0.4056454002857208, + "sampling/sampling_logp_difference/max": 0.9022759199142456, + "sampling/sampling_logp_difference/mean": 0.014128241688013077, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 228.015625, + "completions/mean_terminated_length": 228.015625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.18648967146873474, + "epoch": 0.9669117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0333494594034551, + "kl": 0.052099764347076416, + "learning_rate": 8.58182038509188e-07, + "loss": -0.0773, + "num_tokens": 24920851.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.606330394744873, + "sampling/importance_sampling_ratio/mean": 0.9999731183052063, + "sampling/importance_sampling_ratio/min": 0.6468685269355774, + "sampling/sampling_logp_difference/max": 0.4739522933959961, + "sampling/sampling_logp_difference/mean": 0.010594777762889862, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 203.140625, + "completions/mean_terminated_length": 203.140625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.20786339044570923, + "epoch": 0.9681372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.728660722184858, + "kl": 0.1304941177368164, + "learning_rate": 8.576846291049633e-07, + "loss": 0.0276, + "num_tokens": 24953852.0, + "reward": 0.65625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6535061597824097, + "sampling/importance_sampling_ratio/mean": 0.99996417760849, + "sampling/importance_sampling_ratio/min": 0.39900103211402893, + "sampling/sampling_logp_difference/max": 0.9187912940979004, + "sampling/sampling_logp_difference/mean": 0.013810346834361553, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 249.03125, + "completions/mean_terminated_length": 249.03125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3435039520263672, + "epoch": 0.9693627450980392, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.693010139356701, + "kl": 0.16828186810016632, + "learning_rate": 8.571864936236015e-07, + "loss": -0.0077, + "num_tokens": 24984206.0, + "reward": -0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007719993591309, + "sampling/importance_sampling_ratio/min": 0.5251613259315491, + "sampling/sampling_logp_difference/max": 0.7147760391235352, + "sampling/sampling_logp_difference/mean": 0.016702715307474136, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 196.78125, + "completions/mean_terminated_length": 196.78125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.22374488413333893, + "epoch": 0.9705882352941176, + "frac_reward_zero_std": 0.75, + "grad_norm": 18.92520845751321, + "kl": 0.09307081997394562, + "learning_rate": 8.56687633076286e-07, + "loss": 0.0112, + "num_tokens": 25013152.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.89866304397583, + "sampling/importance_sampling_ratio/mean": 1.0002086162567139, + "sampling/importance_sampling_ratio/min": 0.5483723282814026, + "sampling/sampling_logp_difference/max": 0.6411499977111816, + "sampling/sampling_logp_difference/mean": 0.013037864118814468, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 175.953125, + "completions/mean_terminated_length": 175.953125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.26841437816619873, + "epoch": 0.9718137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.020253246235909, + "kl": 0.15671822428703308, + "learning_rate": 8.561880484756724e-07, + "loss": 0.0147, + "num_tokens": 25044749.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995671510696411, + "sampling/importance_sampling_ratio/min": 0.33376604318618774, + "sampling/sampling_logp_difference/max": 1.305298089981079, + "sampling/sampling_logp_difference/mean": 0.01966940611600876, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 246.671875, + "completions/mean_terminated_length": 246.671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2716260254383087, + "epoch": 0.9730392156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4791742571323687, + "kl": 0.06522978842258453, + "learning_rate": 8.556877408358854e-07, + "loss": -0.031, + "num_tokens": 25076616.0, + "reward": -0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 0.9999372959136963, + "sampling/importance_sampling_ratio/min": 0.4611970782279968, + "sampling/sampling_logp_difference/max": 0.7739298343658447, + "sampling/sampling_logp_difference/mean": 0.014122720807790756, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 224.296875, + "completions/mean_terminated_length": 224.296875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.331301748752594, + "epoch": 0.9742647058823529, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.55834443887189, + "kl": 0.10484770685434341, + "learning_rate": 8.551867111725182e-07, + "loss": -0.015, + "num_tokens": 25107419.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6215503215789795, + "sampling/importance_sampling_ratio/mean": 1.0009255409240723, + "sampling/importance_sampling_ratio/min": 0.6292675733566284, + "sampling/sampling_logp_difference/max": 0.4833827018737793, + "sampling/sampling_logp_difference/mean": 0.01644054800271988, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 233.515625, + "completions/mean_terminated_length": 233.515625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.31792929768562317, + "epoch": 0.9754901960784313, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3828992801403757, + "kl": 0.1063152477145195, + "learning_rate": 8.546849605026288e-07, + "loss": -0.007, + "num_tokens": 25143484.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 0.9995904564857483, + "sampling/importance_sampling_ratio/min": 0.2538815140724182, + "sampling/sampling_logp_difference/max": 1.3708875179290771, + "sampling/sampling_logp_difference/mean": 0.017424102872610092, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 223.65625, + "completions/mean_terminated_length": 223.65625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.31672942638397217, + "epoch": 0.9767156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1051758347447356, + "kl": 0.09138567000627518, + "learning_rate": 8.541824898447397e-07, + "loss": 0.0053, + "num_tokens": 25179574.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006200075149536, + "sampling/importance_sampling_ratio/min": 0.2614830732345581, + "sampling/sampling_logp_difference/max": 1.3413857221603394, + "sampling/sampling_logp_difference/mean": 0.016937199980020523, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 228.671875, + "completions/mean_terminated_length": 228.671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.25868871808052063, + "epoch": 0.9779411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6750072897039248, + "kl": 0.09289942681789398, + "learning_rate": 8.536793002188343e-07, + "loss": -0.0885, + "num_tokens": 25212033.0, + "reward": 0.21875, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6071306467056274, + "sampling/importance_sampling_ratio/mean": 0.9996358752250671, + "sampling/importance_sampling_ratio/min": 0.3208919167518616, + "sampling/sampling_logp_difference/max": 1.1366509199142456, + "sampling/sampling_logp_difference/mean": 0.01383034698665142, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 197.484375, + "completions/mean_terminated_length": 197.484375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.33253058791160583, + "epoch": 0.9791666666666666, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3666806788141803, + "kl": 0.10231654345989227, + "learning_rate": 8.531753926463556e-07, + "loss": -0.0154, + "num_tokens": 25249680.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.7784098386764526, + "sampling/importance_sampling_ratio/mean": 0.9992489814758301, + "sampling/importance_sampling_ratio/min": 0.4159637987613678, + "sampling/sampling_logp_difference/max": 0.8771570920944214, + "sampling/sampling_logp_difference/mean": 0.01705138012766838, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 292.75, + "completions/mean_terminated_length": 292.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.3406507670879364, + "epoch": 0.9803921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3752395077662523, + "kl": 0.09344251453876495, + "learning_rate": 8.526707681502043e-07, + "loss": -0.0142, + "num_tokens": 25297120.0, + "reward": 0.40625, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7945382595062256, + "sampling/importance_sampling_ratio/mean": 1.0003046989440918, + "sampling/importance_sampling_ratio/min": 0.39636990427970886, + "sampling/sampling_logp_difference/max": 0.9254074096679688, + "sampling/sampling_logp_difference/mean": 0.015919357538223267, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 201.15625, + "completions/mean_terminated_length": 201.15625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.22913354635238647, + "epoch": 0.9816176470588235, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5541191621953963, + "kl": 0.08892589807510376, + "learning_rate": 8.521654277547361e-07, + "loss": 0.0113, + "num_tokens": 25330314.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002615451812744, + "sampling/importance_sampling_ratio/min": 0.43362876772880554, + "sampling/sampling_logp_difference/max": 0.835566520690918, + "sampling/sampling_logp_difference/mean": 0.015550851821899414, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 196.515625, + "completions/mean_terminated_length": 196.515625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.27712690830230713, + "epoch": 0.9828431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05341884815081776, + "kl": 0.09480875730514526, + "learning_rate": 8.516593724857597e-07, + "loss": 0.0009, + "num_tokens": 25361051.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8076027631759644, + "sampling/importance_sampling_ratio/mean": 0.9997462034225464, + "sampling/importance_sampling_ratio/min": 0.4924623370170593, + "sampling/sampling_logp_difference/max": 0.7083373069763184, + "sampling/sampling_logp_difference/mean": 0.01574290730059147, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 224.609375, + "completions/mean_terminated_length": 224.609375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.32985979318618774, + "epoch": 0.9840686274509803, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6215617047141229, + "kl": 0.13667967915534973, + "learning_rate": 8.511526033705356e-07, + "loss": 0.001, + "num_tokens": 25393330.0, + "reward": 0.25, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.7628861665725708, + "sampling/importance_sampling_ratio/mean": 0.9999837279319763, + "sampling/importance_sampling_ratio/min": 0.5363121628761292, + "sampling/sampling_logp_difference/max": 0.6230388879776001, + "sampling/sampling_logp_difference/mean": 0.017135314643383026, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1467.0, + "completions/max_terminated_length": 1467.0, + "completions/mean_length": 259.96875, + "completions/mean_terminated_length": 259.96875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.26032283902168274, + "epoch": 0.9852941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9860506551774538, + "kl": 0.08530402183532715, + "learning_rate": 8.506451214377728e-07, + "loss": 0.066, + "num_tokens": 25426640.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.9493076801300049, + "sampling/importance_sampling_ratio/mean": 0.9999415874481201, + "sampling/importance_sampling_ratio/min": 0.47820186614990234, + "sampling/sampling_logp_difference/max": 0.7377223968505859, + "sampling/sampling_logp_difference/mean": 0.015485418029129505, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 199.078125, + "completions/mean_terminated_length": 199.078125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2238267958164215, + "epoch": 0.9865196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061919552971392476, + "kl": 0.09052140265703201, + "learning_rate": 8.501369277176274e-07, + "loss": 0.0009, + "num_tokens": 25461781.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000416040420532, + "sampling/importance_sampling_ratio/min": 0.476005882024765, + "sampling/sampling_logp_difference/max": 1.229488730430603, + "sampling/sampling_logp_difference/mean": 0.01378423348069191, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.24778124690055847, + "epoch": 0.9877450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1377328553002952, + "kl": 0.11969804763793945, + "learning_rate": 8.496280232417007e-07, + "loss": 0.0112, + "num_tokens": 25499621.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000567436218262, + "sampling/importance_sampling_ratio/min": 0.43894508481025696, + "sampling/sampling_logp_difference/max": 0.9489450454711914, + "sampling/sampling_logp_difference/mean": 0.014659663662314415, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 241.71875, + "completions/mean_terminated_length": 241.71875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.2911279797554016, + "epoch": 0.9889705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4861659736004562, + "kl": 0.10022599250078201, + "learning_rate": 8.491184090430363e-07, + "loss": 0.0202, + "num_tokens": 25531443.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6779921054840088, + "sampling/importance_sampling_ratio/mean": 1.000156283378601, + "sampling/importance_sampling_ratio/min": 0.3601919114589691, + "sampling/sampling_logp_difference/max": 1.021118402481079, + "sampling/sampling_logp_difference/mean": 0.015354365110397339, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 259.4375, + "completions/mean_terminated_length": 259.4375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.23779773712158203, + "epoch": 0.9901960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0426753369135231, + "kl": 0.0802290141582489, + "learning_rate": 8.48608086156119e-07, + "loss": 0.0007, + "num_tokens": 25567967.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005545616149902, + "sampling/importance_sampling_ratio/min": 0.48663073778152466, + "sampling/sampling_logp_difference/max": 1.4357918500900269, + "sampling/sampling_logp_difference/mean": 0.014127079397439957, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 176.78125, + "completions/mean_terminated_length": 176.78125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.22128558158874512, + "epoch": 0.991421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3516200128811913, + "kl": 0.10866418480873108, + "learning_rate": 8.480970556168717e-07, + "loss": 0.0343, + "num_tokens": 25591345.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6000850200653076, + "sampling/importance_sampling_ratio/mean": 1.0005159378051758, + "sampling/importance_sampling_ratio/min": 0.5678879618644714, + "sampling/sampling_logp_difference/max": 0.565831184387207, + "sampling/sampling_logp_difference/mean": 0.013683602213859558, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 310.96875, + "completions/mean_terminated_length": 310.96875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.31794682145118713, + "epoch": 0.9926470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5039799554179456, + "kl": 0.08307865262031555, + "learning_rate": 8.47585318462654e-07, + "loss": -0.0748, + "num_tokens": 25631343.0, + "reward": -0.1875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6089153289794922, + "sampling/importance_sampling_ratio/mean": 1.0002610683441162, + "sampling/importance_sampling_ratio/min": 0.4608144164085388, + "sampling/sampling_logp_difference/max": 0.7747598886489868, + "sampling/sampling_logp_difference/mean": 0.014763688668608665, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 294.59375, + "completions/mean_terminated_length": 294.59375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.2942918837070465, + "epoch": 0.9938725490196079, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.274001862638374, + "kl": 0.058023639023303986, + "learning_rate": 8.470728757322603e-07, + "loss": 0.0185, + "num_tokens": 25671749.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.65156888961792, + "sampling/importance_sampling_ratio/mean": 1.0002224445343018, + "sampling/importance_sampling_ratio/min": 0.556668758392334, + "sampling/sampling_logp_difference/max": 0.585784912109375, + "sampling/sampling_logp_difference/mean": 0.014879814349114895, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 194.515625, + "completions/mean_terminated_length": 194.515625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.19599872827529907, + "epoch": 0.9950980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4377343040997632, + "kl": 0.10231795907020569, + "learning_rate": 8.465597284659163e-07, + "loss": -0.0092, + "num_tokens": 25697398.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5985956192016602, + "sampling/importance_sampling_ratio/mean": 1.0001569986343384, + "sampling/importance_sampling_ratio/min": 0.6593726277351379, + "sampling/sampling_logp_difference/max": 0.46912550926208496, + "sampling/sampling_logp_difference/mean": 0.011592017486691475, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 251.515625, + "completions/mean_terminated_length": 251.515625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.23744173347949982, + "epoch": 0.9963235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9911021494469198, + "kl": 0.10130766034126282, + "learning_rate": 8.460458777052788e-07, + "loss": 0.0229, + "num_tokens": 25731127.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000287652015686, + "sampling/importance_sampling_ratio/min": 0.3292010724544525, + "sampling/sampling_logp_difference/max": 1.1110866069793701, + "sampling/sampling_logp_difference/mean": 0.012608356773853302, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 270.484375, + "completions/mean_terminated_length": 270.484375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.33210045099258423, + "epoch": 0.9975490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5617785593321691, + "kl": 0.13967446982860565, + "learning_rate": 8.455313244934324e-07, + "loss": 0.0606, + "num_tokens": 25768550.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.7586973905563354, + "sampling/importance_sampling_ratio/mean": 0.9992921352386475, + "sampling/importance_sampling_ratio/min": 0.48241546750068665, + "sampling/sampling_logp_difference/max": 0.7289495468139648, + "sampling/sampling_logp_difference/mean": 0.01595686748623848, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2153.0, + "completions/max_terminated_length": 2153.0, + "completions/mean_length": 367.5625, + "completions/mean_terminated_length": 367.5625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2818757891654968, + "epoch": 0.9987745098039216, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.1640668352284114, + "kl": 0.10220036655664444, + "learning_rate": 8.450160698748871e-07, + "loss": 0.0247, + "num_tokens": 25808106.0, + "reward": 0.40625, + "reward_std": 0.5986068248748779, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7544389963150024, + "sampling/importance_sampling_ratio/mean": 1.0004475116729736, + "sampling/importance_sampling_ratio/min": 0.6172945499420166, + "sampling/sampling_logp_difference/max": 0.562149167060852, + "sampling/sampling_logp_difference/mean": 0.014506627805531025, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 211.359375, + "completions/mean_terminated_length": 211.359375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2624770402908325, + "epoch": 1.0, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7630636551547239, + "kl": 0.13299322128295898, + "learning_rate": 8.445001148955775e-07, + "loss": 0.0602, + "num_tokens": 25836497.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5277621746063232, + "sampling/importance_sampling_ratio/mean": 0.9998353123664856, + "sampling/importance_sampling_ratio/min": 0.49843165278434753, + "sampling/sampling_logp_difference/max": 0.6962888240814209, + "sampling/sampling_logp_difference/mean": 0.013662048615515232, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 190.421875, + "completions/mean_terminated_length": 190.421875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3429885506629944, + "epoch": 1.0012254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1742526347707032, + "kl": 0.1756715178489685, + "learning_rate": 8.439834606028593e-07, + "loss": 0.0144, + "num_tokens": 25868172.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.9485623836517334, + "sampling/importance_sampling_ratio/mean": 1.0000386238098145, + "sampling/importance_sampling_ratio/min": 0.14807981252670288, + "sampling/sampling_logp_difference/max": 1.910003900527954, + "sampling/sampling_logp_difference/mean": 0.019224446266889572, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 329.390625, + "completions/mean_terminated_length": 329.390625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.2481330931186676, + "epoch": 1.0024509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1708314419743115, + "kl": 0.08997122943401337, + "learning_rate": 8.434661080455082e-07, + "loss": 0.0489, + "num_tokens": 25908165.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001662969589233, + "sampling/importance_sampling_ratio/min": 0.3522055149078369, + "sampling/sampling_logp_difference/max": 1.0435404777526855, + "sampling/sampling_logp_difference/mean": 0.01300597470253706, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 167.546875, + "completions/mean_terminated_length": 167.546875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.18984636664390564, + "epoch": 1.0036764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.68223338459352, + "kl": 0.12585148215293884, + "learning_rate": 8.42948058273717e-07, + "loss": 0.0144, + "num_tokens": 25931880.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6631553173065186, + "sampling/importance_sampling_ratio/mean": 0.999846339225769, + "sampling/importance_sampling_ratio/min": 0.4810182452201843, + "sampling/sampling_logp_difference/max": 0.7318501472473145, + "sampling/sampling_logp_difference/mean": 0.012167340144515038, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 213.171875, + "completions/mean_terminated_length": 213.171875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.29106903076171875, + "epoch": 1.0049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04732077554474528, + "kl": 0.14875267446041107, + "learning_rate": 8.424293123390938e-07, + "loss": 0.0011, + "num_tokens": 25961155.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998732805252075, + "sampling/importance_sampling_ratio/min": 0.5073512196540833, + "sampling/sampling_logp_difference/max": 1.0273196697235107, + "sampling/sampling_logp_difference/mean": 0.016036391258239746, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 199.828125, + "completions/mean_terminated_length": 199.828125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.31752485036849976, + "epoch": 1.0061274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.646825993560745, + "kl": 0.14398261904716492, + "learning_rate": 8.4190987129466e-07, + "loss": -0.0032, + "num_tokens": 25992632.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999907612800598, + "sampling/importance_sampling_ratio/min": 0.38680943846702576, + "sampling/sampling_logp_difference/max": 0.9498231410980225, + "sampling/sampling_logp_difference/mean": 0.018284928053617477, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 240.09375, + "completions/mean_terminated_length": 240.09375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.300971657037735, + "epoch": 1.0073529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.410952001223642, + "kl": 0.08495702594518661, + "learning_rate": 8.413897361948483e-07, + "loss": 0.0394, + "num_tokens": 26025518.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5744410753250122, + "sampling/importance_sampling_ratio/mean": 1.0003352165222168, + "sampling/importance_sampling_ratio/min": 0.5256655216217041, + "sampling/sampling_logp_difference/max": 0.6430901288986206, + "sampling/sampling_logp_difference/mean": 0.015221796929836273, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 267.625, + "completions/mean_terminated_length": 267.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2923009693622589, + "epoch": 1.008578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5553399319783574, + "kl": 0.079134002327919, + "learning_rate": 8.408689080954997e-07, + "loss": 0.03, + "num_tokens": 26063638.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6193829774856567, + "sampling/importance_sampling_ratio/mean": 1.0001189708709717, + "sampling/importance_sampling_ratio/min": 0.522817850112915, + "sampling/sampling_logp_difference/max": 0.648522138595581, + "sampling/sampling_logp_difference/mean": 0.014686044305562973, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 231.703125, + "completions/mean_terminated_length": 231.703125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.23314502835273743, + "epoch": 1.0098039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3422052934420183, + "kl": 0.07978282868862152, + "learning_rate": 8.403473880538625e-07, + "loss": 0.0162, + "num_tokens": 26098547.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001015663146973, + "sampling/importance_sampling_ratio/min": 0.2892739176750183, + "sampling/sampling_logp_difference/max": 1.2403812408447266, + "sampling/sampling_logp_difference/mean": 0.01295526884496212, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 187.78125, + "completions/mean_terminated_length": 187.78125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3091316223144531, + "epoch": 1.0110294117647058, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8914285588470658, + "kl": 0.11706522852182388, + "learning_rate": 8.398251771285892e-07, + "loss": 0.0428, + "num_tokens": 26134501.0, + "reward": 0.75, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.8952840566635132, + "sampling/importance_sampling_ratio/mean": 0.9994056820869446, + "sampling/importance_sampling_ratio/min": 0.28077879548072815, + "sampling/sampling_logp_difference/max": 1.2701880931854248, + "sampling/sampling_logp_difference/mean": 0.017271850258111954, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 265.40625, + "completions/mean_terminated_length": 265.40625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3018098771572113, + "epoch": 1.0122549019607843, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.111976284918549, + "kl": 0.08650106191635132, + "learning_rate": 8.393022763797346e-07, + "loss": -0.0658, + "num_tokens": 26166671.0, + "reward": 0.5, + "reward_std": 0.6393726468086243, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9298874139785767, + "sampling/importance_sampling_ratio/mean": 0.9999447464942932, + "sampling/importance_sampling_ratio/min": 0.308843731880188, + "sampling/sampling_logp_difference/max": 1.174919843673706, + "sampling/sampling_logp_difference/mean": 0.015505645424127579, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 159.0625, + "completions/mean_terminated_length": 159.0625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2503647804260254, + "epoch": 1.0134803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.527716455285949, + "kl": 0.16784119606018066, + "learning_rate": 8.387786868687548e-07, + "loss": 0.0194, + "num_tokens": 26188771.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8835201263427734, + "sampling/importance_sampling_ratio/mean": 1.0008383989334106, + "sampling/importance_sampling_ratio/min": 0.5911896824836731, + "sampling/sampling_logp_difference/max": 0.6331424713134766, + "sampling/sampling_logp_difference/mean": 0.015132634900510311, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 259.28125, + "completions/mean_terminated_length": 259.28125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3421180844306946, + "epoch": 1.0147058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8193258955723857, + "kl": 0.08924973011016846, + "learning_rate": 8.382544096585026e-07, + "loss": 0.055, + "num_tokens": 26220629.0, + "reward": -0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5263513326644897, + "sampling/importance_sampling_ratio/mean": 1.0001721382141113, + "sampling/importance_sampling_ratio/min": 0.43096065521240234, + "sampling/sampling_logp_difference/max": 0.8417384624481201, + "sampling/sampling_logp_difference/mean": 0.017179038375616074, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 252.625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.25044241547584534, + "epoch": 1.0159313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04693399444492541, + "kl": 0.05882733687758446, + "learning_rate": 8.37729445813228e-07, + "loss": 0.0006, + "num_tokens": 26256973.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.762977123260498, + "sampling/importance_sampling_ratio/mean": 1.0009088516235352, + "sampling/importance_sampling_ratio/min": 0.4902009963989258, + "sampling/sampling_logp_difference/max": 0.7129397392272949, + "sampling/sampling_logp_difference/mean": 0.014279183931648731, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 208.046875, + "completions/mean_terminated_length": 208.046875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2281491756439209, + "epoch": 1.017156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.511100354633253, + "kl": 0.07732020318508148, + "learning_rate": 8.372037963985741e-07, + "loss": -0.0297, + "num_tokens": 26293424.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6686607599258423, + "sampling/importance_sampling_ratio/mean": 1.0004651546478271, + "sampling/importance_sampling_ratio/min": 0.31634464859962463, + "sampling/sampling_logp_difference/max": 1.1509230136871338, + "sampling/sampling_logp_difference/mean": 0.014121164567768574, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 254.609375, + "completions/mean_terminated_length": 254.609375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.21974888443946838, + "epoch": 1.0183823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05270365694449295, + "kl": 0.07207030057907104, + "learning_rate": 8.366774624815761e-07, + "loss": 0.0007, + "num_tokens": 26330711.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.626305341720581, + "sampling/importance_sampling_ratio/mean": 0.9997680187225342, + "sampling/importance_sampling_ratio/min": 0.28228700160980225, + "sampling/sampling_logp_difference/max": 1.2648309469223022, + "sampling/sampling_logp_difference/mean": 0.013425543904304504, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 166.84375, + "completions/mean_terminated_length": 166.84375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.2337384819984436, + "epoch": 1.0196078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.526393532776338, + "kl": 0.10532155632972717, + "learning_rate": 8.361504451306584e-07, + "loss": -0.0247, + "num_tokens": 26363629.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6090630292892456, + "sampling/importance_sampling_ratio/mean": 1.0000205039978027, + "sampling/importance_sampling_ratio/min": 0.528343677520752, + "sampling/sampling_logp_difference/max": 0.6380083560943604, + "sampling/sampling_logp_difference/mean": 0.013625487685203552, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 203.984375, + "completions/mean_terminated_length": 203.984375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.2802252769470215, + "epoch": 1.0208333333333333, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.084559533474804, + "kl": 0.11052876710891724, + "learning_rate": 8.356227454156328e-07, + "loss": -0.0214, + "num_tokens": 26392588.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.9827526807785034, + "sampling/importance_sampling_ratio/mean": 0.999845564365387, + "sampling/importance_sampling_ratio/min": 0.37227943539619446, + "sampling/sampling_logp_difference/max": 0.9881105422973633, + "sampling/sampling_logp_difference/mean": 0.016211293637752533, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 207.015625, + "completions/mean_terminated_length": 207.015625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.27637845277786255, + "epoch": 1.0220588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.158562184915467, + "kl": 0.09916109591722488, + "learning_rate": 8.350943644076964e-07, + "loss": 0.007, + "num_tokens": 26422253.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004613399505615, + "sampling/importance_sampling_ratio/min": 0.49313226342201233, + "sampling/sampling_logp_difference/max": 0.7555751800537109, + "sampling/sampling_logp_difference/mean": 0.014827284030616283, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 184.21875, + "completions/mean_terminated_length": 184.21875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.210306316614151, + "epoch": 1.0232843137254901, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4539889291402033, + "kl": 0.12316978722810745, + "learning_rate": 8.34565303179429e-07, + "loss": 0.0, + "num_tokens": 26449435.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4506796598434448, + "sampling/importance_sampling_ratio/mean": 1.000178337097168, + "sampling/importance_sampling_ratio/min": 0.6327508687973022, + "sampling/sampling_logp_difference/max": 0.45767855644226074, + "sampling/sampling_logp_difference/mean": 0.01168773416429758, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3543969988822937, + "epoch": 1.0245098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8453556747320308, + "kl": 0.16368205845355988, + "learning_rate": 8.340355628047917e-07, + "loss": -0.0006, + "num_tokens": 26481067.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.657509684562683, + "sampling/importance_sampling_ratio/mean": 0.9992981553077698, + "sampling/importance_sampling_ratio/min": 0.48332542181015015, + "sampling/sampling_logp_difference/max": 0.7270650863647461, + "sampling/sampling_logp_difference/mean": 0.01765834540128708, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 200.203125, + "completions/mean_terminated_length": 200.203125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3506966829299927, + "epoch": 1.025735294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057758914875663606, + "kl": 0.0971386507153511, + "learning_rate": 8.335051443591234e-07, + "loss": 0.001, + "num_tokens": 26513000.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6982983350753784, + "sampling/importance_sampling_ratio/mean": 0.9998024106025696, + "sampling/importance_sampling_ratio/min": 0.523671567440033, + "sampling/sampling_logp_difference/max": 0.6468906402587891, + "sampling/sampling_logp_difference/mean": 0.01773739606142044, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 207.4375, + "completions/mean_terminated_length": 207.4375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.1751493513584137, + "epoch": 1.0269607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03497628294846244, + "kl": 0.04738932102918625, + "learning_rate": 8.329740489191405e-07, + "loss": 0.0005, + "num_tokens": 26541892.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9018902778625488, + "sampling/importance_sampling_ratio/mean": 1.0002743005752563, + "sampling/importance_sampling_ratio/min": 0.5053152441978455, + "sampling/sampling_logp_difference/max": 0.6825728416442871, + "sampling/sampling_logp_difference/mean": 0.010526393540203571, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 293.875, + "completions/mean_terminated_length": 293.875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.33903390169143677, + "epoch": 1.0281862745098038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.857413451748275, + "kl": 0.09599026292562485, + "learning_rate": 8.324422775629327e-07, + "loss": 0.069, + "num_tokens": 26582940.0, + "reward": 0.46875, + "reward_std": 0.7129635810852051, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000169277191162, + "sampling/importance_sampling_ratio/min": 0.42311522364616394, + "sampling/sampling_logp_difference/max": 1.192920446395874, + "sampling/sampling_logp_difference/mean": 0.016587097197771072, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 154.65625, + "completions/mean_terminated_length": 154.65625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.18241757154464722, + "epoch": 1.0294117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0584323257739424, + "kl": 0.08496911823749542, + "learning_rate": 8.319098313699624e-07, + "loss": 0.0009, + "num_tokens": 26612262.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7230578660964966, + "sampling/importance_sampling_ratio/mean": 0.9996199607849121, + "sampling/importance_sampling_ratio/min": 0.5690866708755493, + "sampling/sampling_logp_difference/max": 0.5637224912643433, + "sampling/sampling_logp_difference/mean": 0.012194900773465633, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 223.96875, + "completions/mean_terminated_length": 223.96875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3323877453804016, + "epoch": 1.0306372549019607, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6835755773075476, + "kl": 0.0954112634062767, + "learning_rate": 8.313767114210615e-07, + "loss": 0.0106, + "num_tokens": 26653860.0, + "reward": 0.75, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.684166431427002, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.6183339953422546, + "sampling/sampling_logp_difference/max": 0.521270751953125, + "sampling/sampling_logp_difference/mean": 0.015068236738443375, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 176.40625, + "completions/mean_terminated_length": 176.40625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.22423449158668518, + "epoch": 1.031862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.41860184529582, + "kl": 0.12526249885559082, + "learning_rate": 8.308429187984298e-07, + "loss": 0.0318, + "num_tokens": 26679870.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6323018074035645, + "sampling/importance_sampling_ratio/mean": 1.000128149986267, + "sampling/importance_sampling_ratio/min": 0.36600685119628906, + "sampling/sampling_logp_difference/max": 1.0051032304763794, + "sampling/sampling_logp_difference/mean": 0.014712570235133171, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.3061140775680542, + "epoch": 1.0330882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7385203158897113, + "kl": 0.08549848198890686, + "learning_rate": 8.303084545856322e-07, + "loss": 0.0283, + "num_tokens": 26723412.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.7838666439056396, + "sampling/importance_sampling_ratio/mean": 1.0001444816589355, + "sampling/importance_sampling_ratio/min": 0.4810205101966858, + "sampling/sampling_logp_difference/max": 0.7318453788757324, + "sampling/sampling_logp_difference/mean": 0.015281646512448788, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 277.90625, + "completions/mean_terminated_length": 277.90625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2246057689189911, + "epoch": 1.0343137254901962, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8822044129650618, + "kl": 0.09816767275333405, + "learning_rate": 8.297733198675977e-07, + "loss": 0.0072, + "num_tokens": 26763022.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005570650100708, + "sampling/importance_sampling_ratio/min": 0.48236799240112305, + "sampling/sampling_logp_difference/max": 0.8305244445800781, + "sampling/sampling_logp_difference/mean": 0.013231072574853897, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 197.53125, + "completions/mean_terminated_length": 197.53125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3606891632080078, + "epoch": 1.0355392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9486747923437764, + "kl": 0.13007526099681854, + "learning_rate": 8.292375157306155e-07, + "loss": 0.006, + "num_tokens": 26795600.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5576754808425903, + "sampling/importance_sampling_ratio/mean": 0.9999874830245972, + "sampling/importance_sampling_ratio/min": 0.6090654134750366, + "sampling/sampling_logp_difference/max": 0.49582958221435547, + "sampling/sampling_logp_difference/mean": 0.01631821319460869, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 193.78125, + "completions/mean_terminated_length": 193.78125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.21424145996570587, + "epoch": 1.036764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.170578415400301, + "kl": 0.0991288274526596, + "learning_rate": 8.287010432623343e-07, + "loss": 0.0082, + "num_tokens": 26823602.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6255465745925903, + "sampling/importance_sampling_ratio/mean": 0.9990866184234619, + "sampling/importance_sampling_ratio/min": 0.3958947956562042, + "sampling/sampling_logp_difference/max": 0.9266067743301392, + "sampling/sampling_logp_difference/mean": 0.014311027713119984, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 208.078125, + "completions/mean_terminated_length": 208.078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.19055885076522827, + "epoch": 1.0379901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8379166457781339, + "kl": 0.12437820434570312, + "learning_rate": 8.281639035517591e-07, + "loss": -0.0026, + "num_tokens": 26851303.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6019763946533203, + "sampling/importance_sampling_ratio/mean": 0.9999380707740784, + "sampling/importance_sampling_ratio/min": 0.3743131160736084, + "sampling/sampling_logp_difference/max": 0.9826626777648926, + "sampling/sampling_logp_difference/mean": 0.011997069232165813, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 187.625, + "completions/mean_terminated_length": 187.625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.18193748593330383, + "epoch": 1.0392156862745099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08189642669918944, + "kl": 0.0676099956035614, + "learning_rate": 8.276260976892495e-07, + "loss": 0.0006, + "num_tokens": 26885919.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999856352806091, + "sampling/importance_sampling_ratio/min": 0.4021023213863373, + "sampling/sampling_logp_difference/max": 0.9808192253112793, + "sampling/sampling_logp_difference/mean": 0.014291755855083466, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 234.75, + "completions/mean_terminated_length": 234.75, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.23510439693927765, + "epoch": 1.0404411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05287333372937939, + "kl": 0.09308743476867676, + "learning_rate": 8.270876267665173e-07, + "loss": 0.0008, + "num_tokens": 26921551.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.678462266921997, + "sampling/importance_sampling_ratio/mean": 1.0004521608352661, + "sampling/importance_sampling_ratio/min": 0.3964764177799225, + "sampling/sampling_logp_difference/max": 0.9251387715339661, + "sampling/sampling_logp_difference/mean": 0.014521561563014984, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 190.859375, + "completions/mean_terminated_length": 190.859375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2246999889612198, + "epoch": 1.0416666666666667, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.688193795113662, + "kl": 0.1231057196855545, + "learning_rate": 8.265484918766242e-07, + "loss": -0.0245, + "num_tokens": 26946950.0, + "reward": -0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6586134433746338, + "sampling/importance_sampling_ratio/mean": 0.9999523758888245, + "sampling/importance_sampling_ratio/min": 0.43952205777168274, + "sampling/sampling_logp_difference/max": 0.822067379951477, + "sampling/sampling_logp_difference/mean": 0.01355016976594925, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 174.859375, + "completions/mean_terminated_length": 174.859375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.24015727639198303, + "epoch": 1.0428921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.937277069742771, + "kl": 0.10738003253936768, + "learning_rate": 8.260086941139804e-07, + "loss": 0.0468, + "num_tokens": 26981453.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998477697372437, + "sampling/importance_sampling_ratio/min": 0.349877268075943, + "sampling/sampling_logp_difference/max": 1.4946722984313965, + "sampling/sampling_logp_difference/mean": 0.017771970480680466, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 166.984375, + "completions/mean_terminated_length": 166.984375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3333693742752075, + "epoch": 1.0441176470588236, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1691347170068873, + "kl": 0.25931957364082336, + "learning_rate": 8.254682345743405e-07, + "loss": -0.0161, + "num_tokens": 27008732.0, + "reward": 0.0, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006260871887207, + "sampling/importance_sampling_ratio/min": 0.3316207826137543, + "sampling/sampling_logp_difference/max": 1.1143497228622437, + "sampling/sampling_logp_difference/mean": 0.019735675305128098, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 234.78125, + "completions/mean_terminated_length": 234.78125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.23807714879512787, + "epoch": 1.045343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9915526582208206, + "kl": 0.07801719009876251, + "learning_rate": 8.249271143548036e-07, + "loss": -0.0117, + "num_tokens": 27043742.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002737045288086, + "sampling/importance_sampling_ratio/min": 0.3906378149986267, + "sampling/sampling_logp_difference/max": 1.68257474899292, + "sampling/sampling_logp_difference/mean": 0.014691174030303955, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 224.8125, + "completions/mean_terminated_length": 224.8125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.24750757217407227, + "epoch": 1.0465686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1995606231439424, + "kl": 0.08733342587947845, + "learning_rate": 8.243853345538093e-07, + "loss": 0.003, + "num_tokens": 27080866.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.817976713180542, + "sampling/importance_sampling_ratio/mean": 0.9999638199806213, + "sampling/importance_sampling_ratio/min": 0.2541900873184204, + "sampling/sampling_logp_difference/max": 1.3696728944778442, + "sampling/sampling_logp_difference/mean": 0.015378719195723534, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 230.296875, + "completions/mean_terminated_length": 230.296875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.21870526671409607, + "epoch": 1.0477941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3422440007440832, + "kl": 0.065306156873703, + "learning_rate": 8.238428962711362e-07, + "loss": 0.0003, + "num_tokens": 27112917.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.5593341588974, + "sampling/importance_sampling_ratio/mean": 1.000219702720642, + "sampling/importance_sampling_ratio/min": 0.5015109777450562, + "sampling/sampling_logp_difference/max": 0.6901297569274902, + "sampling/sampling_logp_difference/mean": 0.013234477490186691, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.2828371226787567, + "epoch": 1.0490196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2205511854288067, + "kl": 0.08889597654342651, + "learning_rate": 8.232998006078997e-07, + "loss": 0.1483, + "num_tokens": 27150403.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000083446502686, + "sampling/importance_sampling_ratio/min": 0.10448633879423141, + "sampling/sampling_logp_difference/max": 2.2586989402770996, + "sampling/sampling_logp_difference/mean": 0.015552111901342869, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 219.546875, + "completions/mean_terminated_length": 219.546875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.16488586366176605, + "epoch": 1.0502450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18047760725524792, + "kl": 0.08904959261417389, + "learning_rate": 8.227560486665498e-07, + "loss": 0.0008, + "num_tokens": 27181606.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006948709487915, + "sampling/importance_sampling_ratio/min": 0.4159519672393799, + "sampling/sampling_logp_difference/max": 0.8771854639053345, + "sampling/sampling_logp_difference/mean": 0.011130171827971935, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 159.03125, + "completions/mean_terminated_length": 159.03125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.22817040979862213, + "epoch": 1.0514705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2812865168860976, + "kl": 0.16115710139274597, + "learning_rate": 8.222116415508682e-07, + "loss": 0.0049, + "num_tokens": 27207736.0, + "reward": -0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.7246307134628296, + "sampling/importance_sampling_ratio/mean": 1.0001046657562256, + "sampling/importance_sampling_ratio/min": 0.39628320932388306, + "sampling/sampling_logp_difference/max": 0.9256261587142944, + "sampling/sampling_logp_difference/mean": 0.015050392597913742, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 185.578125, + "completions/mean_terminated_length": 185.578125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.15592145919799805, + "epoch": 1.0526960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13370403710709478, + "kl": 0.06281337887048721, + "learning_rate": 8.21666580365967e-07, + "loss": 0.0007, + "num_tokens": 27243213.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000174045562744, + "sampling/importance_sampling_ratio/min": 0.46774056553840637, + "sampling/sampling_logp_difference/max": 0.7598414421081543, + "sampling/sampling_logp_difference/mean": 0.011581188067793846, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 199.828125, + "completions/mean_terminated_length": 199.828125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.2729835510253906, + "epoch": 1.053921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09925805994288021, + "kl": 0.0949406623840332, + "learning_rate": 8.211208662182858e-07, + "loss": 0.0009, + "num_tokens": 27277122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5280382633209229, + "sampling/importance_sampling_ratio/mean": 0.9983392953872681, + "sampling/importance_sampling_ratio/min": 0.32965970039367676, + "sampling/sampling_logp_difference/max": 1.1096943616867065, + "sampling/sampling_logp_difference/mean": 0.01671024225652218, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 148.546875, + "completions/mean_terminated_length": 148.546875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.24493072926998138, + "epoch": 1.0551470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11044530073772146, + "kl": 0.11993096768856049, + "learning_rate": 8.205745002155899e-07, + "loss": 0.0012, + "num_tokens": 27305077.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7267796993255615, + "sampling/importance_sampling_ratio/mean": 0.9999323487281799, + "sampling/importance_sampling_ratio/min": 0.4598299264907837, + "sampling/sampling_logp_difference/max": 0.7768986225128174, + "sampling/sampling_logp_difference/mean": 0.017437398433685303, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 182.1875, + "completions/mean_terminated_length": 182.1875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.22966301441192627, + "epoch": 1.0563725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.735345885320772, + "kl": 0.08656550943851471, + "learning_rate": 8.200274834669675e-07, + "loss": 0.0161, + "num_tokens": 27332001.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5988582372665405, + "sampling/importance_sampling_ratio/mean": 1.0000367164611816, + "sampling/importance_sampling_ratio/min": 0.5221561789512634, + "sampling/sampling_logp_difference/max": 0.6497886180877686, + "sampling/sampling_logp_difference/mean": 0.013142306357622147, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 182.453125, + "completions/mean_terminated_length": 182.453125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.20047461986541748, + "epoch": 1.0575980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6184519494389003, + "kl": 0.06266318261623383, + "learning_rate": 8.194798170828279e-07, + "loss": 0.1024, + "num_tokens": 27363582.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9467194080352783, + "sampling/importance_sampling_ratio/mean": 0.9996281266212463, + "sampling/importance_sampling_ratio/min": 0.35779374837875366, + "sampling/sampling_logp_difference/max": 1.0277986526489258, + "sampling/sampling_logp_difference/mean": 0.014652173034846783, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 221.09375, + "completions/mean_terminated_length": 221.09375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.1975773274898529, + "epoch": 1.0588235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2620075077714465, + "kl": 0.07988125085830688, + "learning_rate": 8.189315021748993e-07, + "loss": 0.0345, + "num_tokens": 27394596.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.7520551681518555, + "sampling/importance_sampling_ratio/mean": 0.9999421238899231, + "sampling/importance_sampling_ratio/min": 0.5676584839820862, + "sampling/sampling_logp_difference/max": 0.5662353038787842, + "sampling/sampling_logp_difference/mean": 0.012207001447677612, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 237.0625, + "completions/mean_terminated_length": 237.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.24505576491355896, + "epoch": 1.0600490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23068271821984426, + "kl": 0.07782909274101257, + "learning_rate": 8.183825398562263e-07, + "loss": 0.0007, + "num_tokens": 27427560.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7276512384414673, + "sampling/importance_sampling_ratio/mean": 0.9997975826263428, + "sampling/importance_sampling_ratio/min": 0.47389957308769226, + "sampling/sampling_logp_difference/max": 0.7467598915100098, + "sampling/sampling_logp_difference/mean": 0.01503431424498558, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 131.546875, + "completions/mean_terminated_length": 131.546875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.24680377542972565, + "epoch": 1.0612745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13161481041915632, + "kl": 0.10586653649806976, + "learning_rate": 8.178329312411676e-07, + "loss": 0.0011, + "num_tokens": 27454363.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6429011821746826, + "sampling/importance_sampling_ratio/mean": 1.0001440048217773, + "sampling/importance_sampling_ratio/min": 0.4956373870372772, + "sampling/sampling_logp_difference/max": 0.7019107341766357, + "sampling/sampling_logp_difference/mean": 0.015495985746383667, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 182.8125, + "completions/mean_terminated_length": 182.8125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.208645761013031, + "epoch": 1.0625, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.006884775754296, + "kl": 0.08351355791091919, + "learning_rate": 8.172826774453936e-07, + "loss": 0.043, + "num_tokens": 27478399.0, + "reward": -0.3125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.8095623254776, + "sampling/importance_sampling_ratio/mean": 0.9995898008346558, + "sampling/importance_sampling_ratio/min": 0.6079089045524597, + "sampling/sampling_logp_difference/max": 0.5930850505828857, + "sampling/sampling_logp_difference/mean": 0.01453950721770525, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 167.921875, + "completions/mean_terminated_length": 167.921875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2757169008255005, + "epoch": 1.0637254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8902161735480165, + "kl": 0.11209302395582199, + "learning_rate": 8.16731779585885e-07, + "loss": 0.0087, + "num_tokens": 27513754.0, + "reward": -0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000875473022461, + "sampling/importance_sampling_ratio/min": 0.3849431276321411, + "sampling/sampling_logp_difference/max": 0.9546597003936768, + "sampling/sampling_logp_difference/mean": 0.018580183386802673, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.21587997674942017, + "epoch": 1.0649509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1420870386111802, + "kl": 0.08559355139732361, + "learning_rate": 8.161802387809292e-07, + "loss": 0.0008, + "num_tokens": 27542732.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.859655737876892, + "sampling/importance_sampling_ratio/mean": 1.000511646270752, + "sampling/importance_sampling_ratio/min": 0.4743329584598541, + "sampling/sampling_logp_difference/max": 0.7458457946777344, + "sampling/sampling_logp_difference/mean": 0.015155967324972153, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 245.5, + "completions/mean_terminated_length": 245.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2327881157398224, + "epoch": 1.0661764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1141680236873273, + "kl": 0.07259321212768555, + "learning_rate": 8.156280561501194e-07, + "loss": 0.0071, + "num_tokens": 27581308.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6321566104888916, + "sampling/importance_sampling_ratio/mean": 1.00043523311615, + "sampling/importance_sampling_ratio/min": 0.3766418993473053, + "sampling/sampling_logp_difference/max": 0.9764604568481445, + "sampling/sampling_logp_difference/mean": 0.014734484255313873, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 144.84375, + "completions/mean_terminated_length": 144.84375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.21607786417007446, + "epoch": 1.0674019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09943714136351682, + "kl": 0.09778988361358643, + "learning_rate": 8.150752328143513e-07, + "loss": 0.001, + "num_tokens": 27611090.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7520544528961182, + "sampling/importance_sampling_ratio/mean": 0.9992098808288574, + "sampling/importance_sampling_ratio/min": 0.4616309702396393, + "sampling/sampling_logp_difference/max": 0.7729895114898682, + "sampling/sampling_logp_difference/mean": 0.01589813269674778, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 189.90625, + "completions/mean_terminated_length": 189.90625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.30563634634017944, + "epoch": 1.0686274509803921, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1816561649436643, + "kl": 0.13414150476455688, + "learning_rate": 8.145217698958211e-07, + "loss": 0.0097, + "num_tokens": 27640380.0, + "reward": -0.28125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004225969314575, + "sampling/importance_sampling_ratio/min": 0.46316248178482056, + "sampling/sampling_logp_difference/max": 1.0734158754348755, + "sampling/sampling_logp_difference/mean": 0.01720433682203293, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 173.71875, + "completions/mean_terminated_length": 173.71875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2072143256664276, + "epoch": 1.0698529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08247576173754775, + "kl": 0.07004286348819733, + "learning_rate": 8.139676685180236e-07, + "loss": 0.0007, + "num_tokens": 27668090.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7890275716781616, + "sampling/importance_sampling_ratio/mean": 0.9991549849510193, + "sampling/importance_sampling_ratio/min": 0.3797149658203125, + "sampling/sampling_logp_difference/max": 0.968334436416626, + "sampling/sampling_logp_difference/mean": 0.016450336202979088, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 171.5, + "completions/mean_terminated_length": 171.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.18553629517555237, + "epoch": 1.071078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5123373003069194, + "kl": 0.07597959041595459, + "learning_rate": 8.134129298057495e-07, + "loss": 0.0157, + "num_tokens": 27696906.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9365143775939941, + "sampling/importance_sampling_ratio/mean": 0.9994390606880188, + "sampling/importance_sampling_ratio/min": 0.3349955677986145, + "sampling/sampling_logp_difference/max": 1.0936380624771118, + "sampling/sampling_logp_difference/mean": 0.014272423461079597, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 148.875, + "completions/mean_terminated_length": 148.875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.21442510187625885, + "epoch": 1.0723039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1909442243640793, + "kl": 0.09609478712081909, + "learning_rate": 8.128575548850832e-07, + "loss": 0.0047, + "num_tokens": 27721922.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4697335958480835, + "sampling/importance_sampling_ratio/mean": 0.9998018741607666, + "sampling/importance_sampling_ratio/min": 0.5075057744979858, + "sampling/sampling_logp_difference/max": 0.6782472133636475, + "sampling/sampling_logp_difference/mean": 0.012937607243657112, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 169.75, + "completions/mean_terminated_length": 169.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2670651078224182, + "epoch": 1.0735294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.465152893859862, + "kl": 0.11509145051240921, + "learning_rate": 8.123015448834005e-07, + "loss": 0.0165, + "num_tokens": 27751970.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000338554382324, + "sampling/importance_sampling_ratio/min": 0.2617838978767395, + "sampling/sampling_logp_difference/max": 1.340235948562622, + "sampling/sampling_logp_difference/mean": 0.016946731135249138, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2190035581588745, + "epoch": 1.0747549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7814630699701175, + "kl": 0.08700970560312271, + "learning_rate": 8.117449009293668e-07, + "loss": -0.0111, + "num_tokens": 27777058.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00069260597229, + "sampling/importance_sampling_ratio/min": 0.5145741701126099, + "sampling/sampling_logp_difference/max": 0.8772447109222412, + "sampling/sampling_logp_difference/mean": 0.01563248410820961, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 167.140625, + "completions/mean_terminated_length": 167.140625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.21881946921348572, + "epoch": 1.0759803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09202604581234461, + "kl": 0.09001221507787704, + "learning_rate": 8.111876241529337e-07, + "loss": 0.0009, + "num_tokens": 27805371.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.655460000038147, + "sampling/importance_sampling_ratio/mean": 0.9996316432952881, + "sampling/importance_sampling_ratio/min": 0.5658692121505737, + "sampling/sampling_logp_difference/max": 0.569392204284668, + "sampling/sampling_logp_difference/mean": 0.014748353511095047, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 185.453125, + "completions/mean_terminated_length": 185.453125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2720149755477905, + "epoch": 1.0772058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0505209479500186, + "kl": 0.09834681451320648, + "learning_rate": 8.106297156853379e-07, + "loss": 0.0001, + "num_tokens": 27832936.0, + "reward": 0.15625, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.6116071343421936, + "sampling/sampling_logp_difference/max": 0.7369050979614258, + "sampling/sampling_logp_difference/mean": 0.015585452318191528, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 170.46875, + "completions/mean_terminated_length": 170.46875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.20675668120384216, + "epoch": 1.0784313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1771199407375331, + "kl": 0.08929380774497986, + "learning_rate": 8.100711766590982e-07, + "loss": 0.0009, + "num_tokens": 27862198.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999538779258728, + "sampling/importance_sampling_ratio/min": 0.03980950638651848, + "sampling/sampling_logp_difference/max": 3.223649501800537, + "sampling/sampling_logp_difference/mean": 0.015134681947529316, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 146.078125, + "completions/mean_terminated_length": 146.078125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.18133658170700073, + "epoch": 1.079656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05583044053764275, + "kl": 0.07593855261802673, + "learning_rate": 8.095120082080134e-07, + "loss": 0.0008, + "num_tokens": 27887195.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9179813861846924, + "sampling/importance_sampling_ratio/mean": 0.9999933242797852, + "sampling/importance_sampling_ratio/min": 0.498348593711853, + "sampling/sampling_logp_difference/max": 0.6964554786682129, + "sampling/sampling_logp_difference/mean": 0.013764860108494759, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 164.5625, + "completions/mean_terminated_length": 164.5625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.24878941476345062, + "epoch": 1.0808823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09908678307571109, + "kl": 0.08171862363815308, + "learning_rate": 8.089522114671602e-07, + "loss": 0.0008, + "num_tokens": 27919455.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7992045879364014, + "sampling/importance_sampling_ratio/mean": 0.9997904300689697, + "sampling/importance_sampling_ratio/min": 0.18465721607208252, + "sampling/sampling_logp_difference/max": 1.6892540454864502, + "sampling/sampling_logp_difference/mean": 0.017147742211818695, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 166.234375, + "completions/mean_terminated_length": 166.234375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.20635822415351868, + "epoch": 1.0821078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0998980624978608, + "kl": 0.058448854833841324, + "learning_rate": 8.083917875728905e-07, + "loss": 0.0006, + "num_tokens": 27949966.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00088632106781, + "sampling/importance_sampling_ratio/min": 0.42102333903312683, + "sampling/sampling_logp_difference/max": 0.8650670051574707, + "sampling/sampling_logp_difference/mean": 0.014977429062128067, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 193.109375, + "completions/mean_terminated_length": 193.109375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2436329424381256, + "epoch": 1.0833333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1655872943824683, + "kl": 0.177456796169281, + "learning_rate": 8.07830737662829e-07, + "loss": -0.0015, + "num_tokens": 27981413.0, + "reward": -0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6812682151794434, + "sampling/importance_sampling_ratio/mean": 1.0005836486816406, + "sampling/importance_sampling_ratio/min": 0.5926125645637512, + "sampling/sampling_logp_difference/max": 0.5232144594192505, + "sampling/sampling_logp_difference/mean": 0.014726238325238228, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 125.90625, + "completions/mean_terminated_length": 125.90625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.18380919098854065, + "epoch": 1.0845588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08432329577655735, + "kl": 0.0646153911948204, + "learning_rate": 8.072690628758721e-07, + "loss": 0.0006, + "num_tokens": 28007359.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992244243621826, + "sampling/importance_sampling_ratio/min": 0.5326099395751953, + "sampling/sampling_logp_difference/max": 1.0724267959594727, + "sampling/sampling_logp_difference/mean": 0.014375309459865093, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 173.046875, + "completions/mean_terminated_length": 173.046875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.22392170131206512, + "epoch": 1.0857843137254901, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3293725544143347, + "kl": 0.07281274348497391, + "learning_rate": 8.067067643521833e-07, + "loss": -0.0065, + "num_tokens": 28035122.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6496044397354126, + "sampling/importance_sampling_ratio/mean": 0.9999271035194397, + "sampling/importance_sampling_ratio/min": 0.5685496926307678, + "sampling/sampling_logp_difference/max": 0.5646665692329407, + "sampling/sampling_logp_difference/mean": 0.014200421050190926, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.25153499841690063, + "epoch": 1.0870098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06734149153866195, + "kl": 0.06395354866981506, + "learning_rate": 8.061438432331934e-07, + "loss": 0.0006, + "num_tokens": 28068754.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001474618911743, + "sampling/importance_sampling_ratio/min": 0.416825532913208, + "sampling/sampling_logp_difference/max": 1.0578886270523071, + "sampling/sampling_logp_difference/mean": 0.015244759619235992, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 198.59375, + "completions/mean_terminated_length": 198.59375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.23634985089302063, + "epoch": 1.088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4979627460277762, + "kl": 0.10542238503694534, + "learning_rate": 8.055803006615965e-07, + "loss": 0.004, + "num_tokens": 28096424.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6498923301696777, + "sampling/importance_sampling_ratio/mean": 1.0002491474151611, + "sampling/importance_sampling_ratio/min": 0.07516255229711533, + "sampling/sampling_logp_difference/max": 2.588102102279663, + "sampling/sampling_logp_difference/mean": 0.013747122138738632, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 137.328125, + "completions/mean_terminated_length": 137.328125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.1915649175643921, + "epoch": 1.0894607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.35661133762129155, + "kl": 0.08230677992105484, + "learning_rate": 8.050161377813485e-07, + "loss": 0.0008, + "num_tokens": 28123997.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6965253353118896, + "sampling/importance_sampling_ratio/mean": 1.0004847049713135, + "sampling/importance_sampling_ratio/min": 0.5760728120803833, + "sampling/sampling_logp_difference/max": 0.5515211820602417, + "sampling/sampling_logp_difference/mean": 0.013545336201786995, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 132.84375, + "completions/mean_terminated_length": 132.84375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.2161245346069336, + "epoch": 1.0906862745098038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20850140464210487, + "kl": 0.11967165768146515, + "learning_rate": 8.04451355737664e-07, + "loss": 0.0012, + "num_tokens": 28148515.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000712871551514, + "sampling/importance_sampling_ratio/min": 0.26496079564094543, + "sampling/sampling_logp_difference/max": 1.3281733989715576, + "sampling/sampling_logp_difference/mean": 0.013996437191963196, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 151.109375, + "completions/mean_terminated_length": 151.109375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.2761315107345581, + "epoch": 1.0919117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6413452606663361, + "kl": 0.0643676295876503, + "learning_rate": 8.03885955677015e-07, + "loss": 0.0055, + "num_tokens": 28180666.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7586278915405273, + "sampling/importance_sampling_ratio/mean": 0.9988889694213867, + "sampling/importance_sampling_ratio/min": 0.4414573311805725, + "sampling/sampling_logp_difference/max": 0.817673921585083, + "sampling/sampling_logp_difference/mean": 0.018916862085461617, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 188.90625, + "completions/mean_terminated_length": 188.90625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.22008417546749115, + "epoch": 1.093137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3192490633186968, + "kl": 0.09187553077936172, + "learning_rate": 8.033199387471276e-07, + "loss": 0.0106, + "num_tokens": 28221972.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006036758422852, + "sampling/importance_sampling_ratio/min": 0.4824557304382324, + "sampling/sampling_logp_difference/max": 0.7369165420532227, + "sampling/sampling_logp_difference/mean": 0.015000857412815094, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 202.53125, + "completions/mean_terminated_length": 202.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.19605609774589539, + "epoch": 1.094362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.609952827164341, + "kl": 0.06165199726819992, + "learning_rate": 8.027533060969806e-07, + "loss": 0.0367, + "num_tokens": 28255702.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003710985183716, + "sampling/importance_sampling_ratio/min": 0.32678836584091187, + "sampling/sampling_logp_difference/max": 1.1184425354003906, + "sampling/sampling_logp_difference/mean": 0.013808038085699081, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 175.234375, + "completions/mean_terminated_length": 175.234375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.21981266140937805, + "epoch": 1.0955882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14068715017450223, + "kl": 0.09595729410648346, + "learning_rate": 8.021860588768021e-07, + "loss": 0.0009, + "num_tokens": 28281957.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7782487869262695, + "sampling/importance_sampling_ratio/mean": 1.0002150535583496, + "sampling/importance_sampling_ratio/min": 0.567199170589447, + "sampling/sampling_logp_difference/max": 0.5756289958953857, + "sampling/sampling_logp_difference/mean": 0.014246910810470581, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 189.3125, + "completions/mean_terminated_length": 189.3125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.268449068069458, + "epoch": 1.0968137254901962, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6400814076191048, + "kl": 0.09315968304872513, + "learning_rate": 8.016181982380681e-07, + "loss": -0.0292, + "num_tokens": 28311689.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.9858661890029907, + "sampling/importance_sampling_ratio/mean": 1.0002015829086304, + "sampling/importance_sampling_ratio/min": 0.5013279318809509, + "sampling/sampling_logp_difference/max": 0.6904948353767395, + "sampling/sampling_logp_difference/mean": 0.016102006658911705, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 113.84375, + "completions/mean_terminated_length": 113.84375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.15809348225593567, + "epoch": 1.0980392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09564743092595664, + "kl": 0.07759271562099457, + "learning_rate": 8.010497253335e-07, + "loss": 0.0008, + "num_tokens": 28334575.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.578310489654541, + "sampling/importance_sampling_ratio/mean": 0.9996614456176758, + "sampling/importance_sampling_ratio/min": 0.551412045955658, + "sampling/sampling_logp_difference/max": 0.5952730178833008, + "sampling/sampling_logp_difference/mean": 0.013403412885963917, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 172.15625, + "completions/mean_terminated_length": 172.15625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2104586362838745, + "epoch": 1.099264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08695914435531384, + "kl": 0.09293670952320099, + "learning_rate": 8.004806413170612e-07, + "loss": 0.0008, + "num_tokens": 28362585.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001728534698486, + "sampling/importance_sampling_ratio/min": 0.4738742709159851, + "sampling/sampling_logp_difference/max": 0.7468132972717285, + "sampling/sampling_logp_difference/mean": 0.013741472736001015, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 199.0, + "completions/mean_terminated_length": 199.0, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.279516339302063, + "epoch": 1.1004901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1899765711186496, + "kl": 0.10613664984703064, + "learning_rate": 7.999109473439569e-07, + "loss": -0.0026, + "num_tokens": 28392073.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6462812423706055, + "sampling/importance_sampling_ratio/mean": 0.9996769428253174, + "sampling/importance_sampling_ratio/min": 0.25184938311576843, + "sampling/sampling_logp_difference/max": 1.3789241313934326, + "sampling/sampling_logp_difference/mean": 0.016572657972574234, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 146.484375, + "completions/mean_terminated_length": 146.484375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.17928224802017212, + "epoch": 1.1017156862745099, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6791457481155367, + "kl": 0.08268716931343079, + "learning_rate": 7.993406445706292e-07, + "loss": 0.0308, + "num_tokens": 28420568.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997105598449707, + "sampling/importance_sampling_ratio/min": 0.3454512059688568, + "sampling/sampling_logp_difference/max": 1.062903881072998, + "sampling/sampling_logp_difference/mean": 0.013035709038376808, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 142.625, + "completions/mean_terminated_length": 142.625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.21753069758415222, + "epoch": 1.1029411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8405227516058766, + "kl": 0.061972249299287796, + "learning_rate": 7.987697341547568e-07, + "loss": 0.0221, + "num_tokens": 28443344.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6958839893341064, + "sampling/importance_sampling_ratio/mean": 1.0006866455078125, + "sampling/importance_sampling_ratio/min": 0.6069773435592651, + "sampling/sampling_logp_difference/max": 0.528204083442688, + "sampling/sampling_logp_difference/mean": 0.014490251429378986, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 167.203125, + "completions/mean_terminated_length": 167.203125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.23172199726104736, + "epoch": 1.1041666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.084201416307403, + "kl": 0.11401450634002686, + "learning_rate": 7.981982172552517e-07, + "loss": -0.0055, + "num_tokens": 28472525.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6166857481002808, + "sampling/importance_sampling_ratio/mean": 0.9994915127754211, + "sampling/importance_sampling_ratio/min": 0.47824835777282715, + "sampling/sampling_logp_difference/max": 0.7376251220703125, + "sampling/sampling_logp_difference/mean": 0.015587793663144112, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 142.96875, + "completions/mean_terminated_length": 142.96875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.14093346893787384, + "epoch": 1.1053921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0901951837919706, + "kl": 0.06298256665468216, + "learning_rate": 7.976260950322571e-07, + "loss": -0.018, + "num_tokens": 28496107.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0012966394424438, + "sampling/importance_sampling_ratio/min": 0.28800708055496216, + "sampling/sampling_logp_difference/max": 1.2447702884674072, + "sampling/sampling_logp_difference/mean": 0.013270031660795212, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 185.25, + "completions/mean_terminated_length": 185.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.22646483778953552, + "epoch": 1.1066176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3118736590667104, + "kl": 0.061849504709243774, + "learning_rate": 7.970533686471448e-07, + "loss": -0.0118, + "num_tokens": 28532011.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000143051147461, + "sampling/importance_sampling_ratio/min": 0.4926973581314087, + "sampling/sampling_logp_difference/max": 0.9254157543182373, + "sampling/sampling_logp_difference/mean": 0.014329024590551853, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 173.78125, + "completions/mean_terminated_length": 173.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.18403930962085724, + "epoch": 1.107843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08850598821881436, + "kl": 0.061722591519355774, + "learning_rate": 7.964800392625128e-07, + "loss": 0.0006, + "num_tokens": 28561373.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.527761459350586, + "sampling/importance_sampling_ratio/mean": 0.9997848272323608, + "sampling/importance_sampling_ratio/min": 0.2004726231098175, + "sampling/sampling_logp_difference/max": 1.6070775985717773, + "sampling/sampling_logp_difference/mean": 0.013661077246069908, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 166.78125, + "completions/mean_terminated_length": 166.78125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.17432111501693726, + "epoch": 1.1090686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07721996234452343, + "kl": 0.07478277385234833, + "learning_rate": 7.959061080421838e-07, + "loss": 0.0007, + "num_tokens": 28590415.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009045600891113, + "sampling/importance_sampling_ratio/min": 0.5906212329864502, + "sampling/sampling_logp_difference/max": 0.9104375839233398, + "sampling/sampling_logp_difference/mean": 0.013019047677516937, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 149.546875, + "completions/mean_terminated_length": 149.546875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.2405448853969574, + "epoch": 1.1102941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11768022147162954, + "kl": 0.09011099487543106, + "learning_rate": 7.953315761512017e-07, + "loss": 0.0008, + "num_tokens": 28616482.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6234381198883057, + "sampling/importance_sampling_ratio/mean": 0.999937117099762, + "sampling/importance_sampling_ratio/min": 0.5206286907196045, + "sampling/sampling_logp_difference/max": 0.652718186378479, + "sampling/sampling_logp_difference/mean": 0.01653948426246643, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 175.625, + "completions/mean_terminated_length": 175.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.24027252197265625, + "epoch": 1.1115196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048770372300378353, + "kl": 0.09848096966743469, + "learning_rate": 7.947564447558299e-07, + "loss": 0.0009, + "num_tokens": 28643450.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5679230690002441, + "sampling/importance_sampling_ratio/mean": 0.9994180202484131, + "sampling/importance_sampling_ratio/min": 0.3743257522583008, + "sampling/sampling_logp_difference/max": 0.9826288223266602, + "sampling/sampling_logp_difference/mean": 0.014152075164020061, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 190.59375, + "completions/mean_terminated_length": 190.59375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.21170996129512787, + "epoch": 1.1127450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057692996553791565, + "kl": 0.06437669694423676, + "learning_rate": 7.941807150235485e-07, + "loss": 0.0006, + "num_tokens": 28676832.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.697697401046753, + "sampling/importance_sampling_ratio/mean": 0.9997852444648743, + "sampling/importance_sampling_ratio/min": 0.5717601180076599, + "sampling/sampling_logp_difference/max": 0.5590357780456543, + "sampling/sampling_logp_difference/mean": 0.01418859139084816, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 214.28125, + "completions/mean_terminated_length": 214.28125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.23234930634498596, + "epoch": 1.1139705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055265118978672645, + "kl": 0.08269356191158295, + "learning_rate": 7.936043881230525e-07, + "loss": 0.0008, + "num_tokens": 28709346.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997777938842773, + "sampling/importance_sampling_ratio/min": 0.447732537984848, + "sampling/sampling_logp_difference/max": 1.0789175033569336, + "sampling/sampling_logp_difference/mean": 0.0158439502120018, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3130796253681183, + "epoch": 1.1151960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.400752969110404, + "kl": 0.11039891093969345, + "learning_rate": 7.930274652242491e-07, + "loss": 0.0088, + "num_tokens": 28736522.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6554890871047974, + "sampling/importance_sampling_ratio/mean": 0.9998528361320496, + "sampling/importance_sampling_ratio/min": 0.6171384453773499, + "sampling/sampling_logp_difference/max": 0.504096508026123, + "sampling/sampling_logp_difference/mean": 0.01720406860113144, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 206.96875, + "completions/mean_terminated_length": 206.96875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.24904829263687134, + "epoch": 1.116421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.666155549267269, + "kl": 0.0964755266904831, + "learning_rate": 7.924499474982551e-07, + "loss": 0.011, + "num_tokens": 28774504.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8129268884658813, + "sampling/importance_sampling_ratio/mean": 0.9995471239089966, + "sampling/importance_sampling_ratio/min": 0.2583095133304596, + "sampling/sampling_logp_difference/max": 1.3535966873168945, + "sampling/sampling_logp_difference/mean": 0.01603316329419613, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.25998538732528687, + "epoch": 1.1176470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.328124972758362, + "kl": 0.08714132755994797, + "learning_rate": 7.91871836117395e-07, + "loss": 0.0159, + "num_tokens": 28800528.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000813007354736, + "sampling/importance_sampling_ratio/min": 0.52607262134552, + "sampling/sampling_logp_difference/max": 0.9863278865814209, + "sampling/sampling_logp_difference/mean": 0.01717241108417511, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 186.59375, + "completions/mean_terminated_length": 186.59375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2034502923488617, + "epoch": 1.1188725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04940559681708589, + "kl": 0.07114953547716141, + "learning_rate": 7.91293132255198e-07, + "loss": 0.0007, + "num_tokens": 28833446.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.645533800125122, + "sampling/importance_sampling_ratio/mean": 0.9995165467262268, + "sampling/importance_sampling_ratio/min": 0.6127004623413086, + "sampling/sampling_logp_difference/max": 0.4980647563934326, + "sampling/sampling_logp_difference/mean": 0.012644816190004349, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 190.296875, + "completions/mean_terminated_length": 190.296875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.24596183001995087, + "epoch": 1.1200980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5655704185905628, + "kl": 0.06891372799873352, + "learning_rate": 7.907138370863967e-07, + "loss": -0.0189, + "num_tokens": 28863177.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6278371810913086, + "sampling/importance_sampling_ratio/mean": 0.9996863007545471, + "sampling/importance_sampling_ratio/min": 0.4309251606464386, + "sampling/sampling_logp_difference/max": 0.8418208360671997, + "sampling/sampling_logp_difference/mean": 0.014467434026300907, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 213.203125, + "completions/mean_terminated_length": 213.203125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.1680026650428772, + "epoch": 1.1213235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.107844507143739, + "kl": 0.062312401831150055, + "learning_rate": 7.901339517869232e-07, + "loss": -0.0027, + "num_tokens": 28897062.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.7520533800125122, + "sampling/importance_sampling_ratio/mean": 0.9992839694023132, + "sampling/importance_sampling_ratio/min": 0.1829947978258133, + "sampling/sampling_logp_difference/max": 1.6982975006103516, + "sampling/sampling_logp_difference/mean": 0.01189229916781187, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 124.671875, + "completions/mean_terminated_length": 124.671875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.20145872235298157, + "epoch": 1.1225490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0697433020845855, + "kl": 0.07393026351928711, + "learning_rate": 7.895534775339083e-07, + "loss": 0.0008, + "num_tokens": 28925697.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6738722324371338, + "sampling/importance_sampling_ratio/mean": 0.9997876286506653, + "sampling/importance_sampling_ratio/min": 0.42287662625312805, + "sampling/sampling_logp_difference/max": 0.8606748580932617, + "sampling/sampling_logp_difference/mean": 0.014831296168267727, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 188.28125, + "completions/mean_terminated_length": 188.28125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2790355980396271, + "epoch": 1.1237745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6326954238160873, + "kl": 0.09928452968597412, + "learning_rate": 7.889724155056776e-07, + "loss": -0.0389, + "num_tokens": 28965555.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000627040863037, + "sampling/importance_sampling_ratio/min": 0.29445353150367737, + "sampling/sampling_logp_difference/max": 1.2226340770721436, + "sampling/sampling_logp_difference/mean": 0.017847351729869843, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 236.8125, + "completions/mean_terminated_length": 236.8125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2594095766544342, + "epoch": 1.125, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.361891349827105, + "kl": 0.090843565762043, + "learning_rate": 7.883907668817506e-07, + "loss": -0.004, + "num_tokens": 29000967.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6158757209777832, + "sampling/importance_sampling_ratio/mean": 1.0001858472824097, + "sampling/importance_sampling_ratio/min": 0.5726673603057861, + "sampling/sampling_logp_difference/max": 0.5574502944946289, + "sampling/sampling_logp_difference/mean": 0.01475977711379528, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 263.03125, + "completions/mean_terminated_length": 263.03125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.27755606174468994, + "epoch": 1.1262254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5527147258963039, + "kl": 0.10148150473833084, + "learning_rate": 7.878085328428368e-07, + "loss": -0.0231, + "num_tokens": 29034137.0, + "reward": 0.21875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000124216079712, + "sampling/importance_sampling_ratio/min": 0.4954375624656677, + "sampling/sampling_logp_difference/max": 0.7390586137771606, + "sampling/sampling_logp_difference/mean": 0.014863025397062302, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 229.78125, + "completions/mean_terminated_length": 229.78125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2748275399208069, + "epoch": 1.1274509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8659229034113305, + "kl": 0.06278257071971893, + "learning_rate": 7.872257145708345e-07, + "loss": 0.0131, + "num_tokens": 29070443.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5545119047164917, + "sampling/importance_sampling_ratio/mean": 0.9995891451835632, + "sampling/importance_sampling_ratio/min": 0.3962823748588562, + "sampling/sampling_logp_difference/max": 0.9256283044815063, + "sampling/sampling_logp_difference/mean": 0.016074594110250473, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 249.0625, + "completions/mean_terminated_length": 249.0625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.1775428056716919, + "epoch": 1.1286764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2413018919858242, + "kl": 0.06724140048027039, + "learning_rate": 7.86642313248828e-07, + "loss": 0.0006, + "num_tokens": 29102079.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8754522800445557, + "sampling/importance_sampling_ratio/mean": 0.9997156262397766, + "sampling/importance_sampling_ratio/min": 0.3408900499343872, + "sampling/sampling_logp_difference/max": 1.076195240020752, + "sampling/sampling_logp_difference/mean": 0.012227769941091537, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3411758840084076, + "epoch": 1.1299019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6798456484998654, + "kl": 0.10103578865528107, + "learning_rate": 7.860583300610847e-07, + "loss": -0.0457, + "num_tokens": 29143757.0, + "reward": 0.34375, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6482800245285034, + "sampling/importance_sampling_ratio/mean": 1.0002042055130005, + "sampling/importance_sampling_ratio/min": 0.6066067218780518, + "sampling/sampling_logp_difference/max": 0.4998745918273926, + "sampling/sampling_logp_difference/mean": 0.017368076369166374, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 223.375, + "completions/mean_terminated_length": 223.375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.2883768081665039, + "epoch": 1.1311274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04460912249120463, + "kl": 0.06478792428970337, + "learning_rate": 7.854737661930539e-07, + "loss": 0.0006, + "num_tokens": 29173413.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7809442281723022, + "sampling/importance_sampling_ratio/mean": 0.9995173811912537, + "sampling/importance_sampling_ratio/min": 0.2171446681022644, + "sampling/sampling_logp_difference/max": 1.5271915197372437, + "sampling/sampling_logp_difference/mean": 0.015471132472157478, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 242.90625, + "completions/mean_terminated_length": 242.90625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2667590379714966, + "epoch": 1.1323529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4411946643889548, + "kl": 0.09007437527179718, + "learning_rate": 7.848886228313632e-07, + "loss": 0.0905, + "num_tokens": 29208687.0, + "reward": 0.71875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.6544312238693237, + "sampling/importance_sampling_ratio/mean": 0.9998416304588318, + "sampling/importance_sampling_ratio/min": 0.5485104322433472, + "sampling/sampling_logp_difference/max": 0.6005489826202393, + "sampling/sampling_logp_difference/mean": 0.014343110844492912, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 172.328125, + "completions/mean_terminated_length": 172.328125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.18077805638313293, + "epoch": 1.133578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0403236538211983, + "kl": 0.04739869385957718, + "learning_rate": 7.843029011638162e-07, + "loss": 0.0005, + "num_tokens": 29232884.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002453327178955, + "sampling/importance_sampling_ratio/min": 0.48107123374938965, + "sampling/sampling_logp_difference/max": 0.746837854385376, + "sampling/sampling_logp_difference/mean": 0.011145679280161858, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 227.3125, + "completions/mean_terminated_length": 227.3125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.22841443121433258, + "epoch": 1.1348039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9636693012874294, + "kl": 0.07613380253314972, + "learning_rate": 7.837166023793908e-07, + "loss": -0.0178, + "num_tokens": 29266520.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.75249445438385, + "sampling/importance_sampling_ratio/mean": 1.000523328781128, + "sampling/importance_sampling_ratio/min": 0.5525768399238586, + "sampling/sampling_logp_difference/max": 0.5931627750396729, + "sampling/sampling_logp_difference/mean": 0.012801182456314564, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 155.28125, + "completions/mean_terminated_length": 155.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.23076486587524414, + "epoch": 1.1360294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8722361970346875, + "kl": 0.10799363255500793, + "learning_rate": 7.831297276682368e-07, + "loss": 0.033, + "num_tokens": 29290570.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.8826043605804443, + "sampling/importance_sampling_ratio/mean": 1.0002775192260742, + "sampling/importance_sampling_ratio/min": 0.4824259579181671, + "sampling/sampling_logp_difference/max": 0.7289278507232666, + "sampling/sampling_logp_difference/mean": 0.01528744027018547, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 170.953125, + "completions/mean_terminated_length": 170.953125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2521914541721344, + "epoch": 1.1372549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05808780249728172, + "kl": 0.10633736848831177, + "learning_rate": 7.825422782216724e-07, + "loss": 0.0011, + "num_tokens": 29321895.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4849331378936768, + "sampling/importance_sampling_ratio/mean": 0.9999880194664001, + "sampling/importance_sampling_ratio/min": 0.6073311567306519, + "sampling/sampling_logp_difference/max": 0.49868106842041016, + "sampling/sampling_logp_difference/mean": 0.014466575346887112, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 190.828125, + "completions/mean_terminated_length": 190.828125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.23662221431732178, + "epoch": 1.1384803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6294933536697809, + "kl": 0.05445215106010437, + "learning_rate": 7.819542552321827e-07, + "loss": 0.0043, + "num_tokens": 29349644.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.4760836362838745, + "sampling/importance_sampling_ratio/mean": 0.9997459650039673, + "sampling/importance_sampling_ratio/min": 0.5910096168518066, + "sampling/sampling_logp_difference/max": 0.525922954082489, + "sampling/sampling_logp_difference/mean": 0.013352379202842712, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 202.984375, + "completions/mean_terminated_length": 202.984375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.27404844760894775, + "epoch": 1.1397058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2645598118345176, + "kl": 0.07706104218959808, + "learning_rate": 7.813656598934173e-07, + "loss": -0.0056, + "num_tokens": 29379915.0, + "reward": -0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998271465301514, + "sampling/importance_sampling_ratio/min": 0.5051249265670776, + "sampling/sampling_logp_difference/max": 0.7048373222351074, + "sampling/sampling_logp_difference/mean": 0.0161677747964859, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 269.28125, + "completions/mean_terminated_length": 269.28125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.26741868257522583, + "epoch": 1.1409313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9393949610844635, + "kl": 0.07544878125190735, + "learning_rate": 7.807764934001874e-07, + "loss": -0.0045, + "num_tokens": 29413757.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6231114864349365, + "sampling/importance_sampling_ratio/mean": 0.9996011257171631, + "sampling/importance_sampling_ratio/min": 0.23029865324497223, + "sampling/sampling_logp_difference/max": 1.4683783054351807, + "sampling/sampling_logp_difference/mean": 0.015428060665726662, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 197.71875, + "completions/mean_terminated_length": 197.71875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.24409207701683044, + "epoch": 1.142156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07716562395080737, + "kl": 0.09720595180988312, + "learning_rate": 7.801867569484634e-07, + "loss": 0.0009, + "num_tokens": 29448891.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.964284062385559, + "sampling/importance_sampling_ratio/mean": 0.9992693662643433, + "sampling/importance_sampling_ratio/min": 0.3163638114929199, + "sampling/sampling_logp_difference/max": 1.150862455368042, + "sampling/sampling_logp_difference/mean": 0.01469617709517479, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 171.5, + "completions/mean_terminated_length": 171.5, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.26554036140441895, + "epoch": 1.1433823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5312344576887653, + "kl": 0.09911169111728668, + "learning_rate": 7.795964517353733e-07, + "loss": 0.0153, + "num_tokens": 29475595.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6276973485946655, + "sampling/importance_sampling_ratio/mean": 0.9995471239089966, + "sampling/importance_sampling_ratio/min": 0.4482499063014984, + "sampling/sampling_logp_difference/max": 0.8024044036865234, + "sampling/sampling_logp_difference/mean": 0.014082803390920162, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 170.296875, + "completions/mean_terminated_length": 170.296875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.23128408193588257, + "epoch": 1.1446078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.611400382252435, + "kl": 0.09448525309562683, + "learning_rate": 7.790055789591993e-07, + "loss": 0.0169, + "num_tokens": 29502830.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6554914712905884, + "sampling/importance_sampling_ratio/mean": 1.000464677810669, + "sampling/importance_sampling_ratio/min": 0.5213980674743652, + "sampling/sampling_logp_difference/max": 0.6512415409088135, + "sampling/sampling_logp_difference/mean": 0.014646067284047604, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 164.640625, + "completions/mean_terminated_length": 164.640625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.19989417493343353, + "epoch": 1.1458333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043187364235219225, + "kl": 0.06290332973003387, + "learning_rate": 7.784141398193753e-07, + "loss": 0.0006, + "num_tokens": 29537687.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6942651271820068, + "sampling/importance_sampling_ratio/mean": 0.9995973706245422, + "sampling/importance_sampling_ratio/min": 0.6011142730712891, + "sampling/sampling_logp_difference/max": 0.5272490978240967, + "sampling/sampling_logp_difference/mean": 0.012707795947790146, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 238.640625, + "completions/mean_terminated_length": 238.640625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.270319402217865, + "epoch": 1.1470588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0861291354862272, + "kl": 0.11952750384807587, + "learning_rate": 7.778221355164857e-07, + "loss": -0.0327, + "num_tokens": 29579984.0, + "reward": 0.21875, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.868499755859375, + "sampling/importance_sampling_ratio/mean": 1.0005584955215454, + "sampling/importance_sampling_ratio/min": 0.48394113779067993, + "sampling/sampling_logp_difference/max": 0.7257919311523438, + "sampling/sampling_logp_difference/mean": 0.015709731727838516, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 196.109375, + "completions/mean_terminated_length": 196.109375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.21992076933383942, + "epoch": 1.1482843137254901, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5846443807671733, + "kl": 0.0719984918832779, + "learning_rate": 7.772295672522614e-07, + "loss": -0.0043, + "num_tokens": 29611431.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.987679362297058, + "sampling/importance_sampling_ratio/mean": 0.9999401569366455, + "sampling/importance_sampling_ratio/min": 0.26966604590415955, + "sampling/sampling_logp_difference/max": 1.3105709552764893, + "sampling/sampling_logp_difference/mean": 0.013067006133496761, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 159.34375, + "completions/mean_terminated_length": 159.34375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2844783365726471, + "epoch": 1.1495098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.106287045051587, + "kl": 0.14609958231449127, + "learning_rate": 7.766364362295788e-07, + "loss": -0.0294, + "num_tokens": 29640813.0, + "reward": -0.21875, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6640719175338745, + "sampling/importance_sampling_ratio/mean": 0.9998704791069031, + "sampling/importance_sampling_ratio/min": 0.54460608959198, + "sampling/sampling_logp_difference/max": 0.6076924800872803, + "sampling/sampling_logp_difference/mean": 0.017406413331627846, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 184.140625, + "completions/mean_terminated_length": 184.140625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3216135799884796, + "epoch": 1.150735294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7505349015734255, + "kl": 0.14020898938179016, + "learning_rate": 7.760427436524559e-07, + "loss": -0.0575, + "num_tokens": 29671094.0, + "reward": 0.21875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6192717552185059, + "sampling/importance_sampling_ratio/mean": 1.0005667209625244, + "sampling/importance_sampling_ratio/min": 0.6146707534790039, + "sampling/sampling_logp_difference/max": 0.48666858673095703, + "sampling/sampling_logp_difference/mean": 0.01606844738125801, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 183.328125, + "completions/mean_terminated_length": 183.328125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2882741689682007, + "epoch": 1.1519607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2742192952923719, + "kl": 0.11009672284126282, + "learning_rate": 7.754484907260512e-07, + "loss": -0.0043, + "num_tokens": 29700539.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9129751920700073, + "sampling/importance_sampling_ratio/mean": 0.9998451471328735, + "sampling/importance_sampling_ratio/min": 0.5417364239692688, + "sampling/sampling_logp_difference/max": 0.6486597061157227, + "sampling/sampling_logp_difference/mean": 0.016127295792102814, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 235.953125, + "completions/mean_terminated_length": 235.953125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2792004346847534, + "epoch": 1.153186274509804, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7810826416466288, + "kl": 0.09145475178956985, + "learning_rate": 7.748536786566606e-07, + "loss": 0.0448, + "num_tokens": 29734712.0, + "reward": 0.46875, + "reward_std": 0.5281128883361816, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8409394025802612, + "sampling/importance_sampling_ratio/mean": 0.9997409582138062, + "sampling/importance_sampling_ratio/min": 0.32505807280540466, + "sampling/sampling_logp_difference/max": 1.1237514019012451, + "sampling/sampling_logp_difference/mean": 0.014812508597970009, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 165.6875, + "completions/mean_terminated_length": 165.6875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.31966108083724976, + "epoch": 1.1544117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.281936858636925, + "kl": 0.1147167757153511, + "learning_rate": 7.742583086517149e-07, + "loss": 0.0217, + "num_tokens": 29767604.0, + "reward": 0.6875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5733104944229126, + "sampling/importance_sampling_ratio/mean": 1.0004137754440308, + "sampling/importance_sampling_ratio/min": 0.5986762642860413, + "sampling/sampling_logp_difference/max": 0.5130343437194824, + "sampling/sampling_logp_difference/mean": 0.015881778672337532, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 239.015625, + "completions/mean_terminated_length": 239.015625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24411898851394653, + "epoch": 1.155637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1515480831371738, + "kl": 0.06654952466487885, + "learning_rate": 7.736623819197773e-07, + "loss": 0.0136, + "num_tokens": 29800933.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.0002235174179077, + "sampling/importance_sampling_ratio/min": 0.49754151701927185, + "sampling/sampling_logp_difference/max": 0.6980762481689453, + "sampling/sampling_logp_difference/mean": 0.014166567474603653, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 193.046875, + "completions/mean_terminated_length": 193.046875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2028464674949646, + "epoch": 1.156862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15238905596366073, + "kl": 0.07373914867639542, + "learning_rate": 7.730658996705415e-07, + "loss": 0.0007, + "num_tokens": 29832808.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999911785125732, + "sampling/importance_sampling_ratio/min": 0.4362257421016693, + "sampling/sampling_logp_difference/max": 1.1146588325500488, + "sampling/sampling_logp_difference/mean": 0.013756453059613705, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 147.140625, + "completions/mean_terminated_length": 147.140625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.24343600869178772, + "epoch": 1.1580882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640526979620608, + "kl": 0.12458021193742752, + "learning_rate": 7.724688631148286e-07, + "loss": 0.0012, + "num_tokens": 29860337.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7520533800125122, + "sampling/importance_sampling_ratio/mean": 0.9997891187667847, + "sampling/importance_sampling_ratio/min": 0.48723921179771423, + "sampling/sampling_logp_difference/max": 0.7190001010894775, + "sampling/sampling_logp_difference/mean": 0.0150705361738801, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 198.625, + "completions/mean_terminated_length": 198.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.24111589789390564, + "epoch": 1.159313725490196, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.088480544947843, + "kl": 0.09656170010566711, + "learning_rate": 7.718712734645849e-07, + "loss": 0.0145, + "num_tokens": 29891225.0, + "reward": 0.5625, + "reward_std": 0.622555673122406, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.9504865407943726, + "sampling/importance_sampling_ratio/mean": 1.0003337860107422, + "sampling/importance_sampling_ratio/min": 0.4205363392829895, + "sampling/sampling_logp_difference/max": 0.8662244081497192, + "sampling/sampling_logp_difference/mean": 0.014271966181695461, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 164.921875, + "completions/mean_terminated_length": 164.921875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.23615966737270355, + "epoch": 1.1605392156862746, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.360246726436841, + "kl": 0.11994391679763794, + "learning_rate": 7.712731319328797e-07, + "loss": 0.0086, + "num_tokens": 29919316.0, + "reward": -0.1875, + "reward_std": 0.6143567562103271, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.9379637241363525, + "sampling/importance_sampling_ratio/mean": 0.9998552799224854, + "sampling/importance_sampling_ratio/min": 0.2048644721508026, + "sampling/sampling_logp_difference/max": 1.5854065418243408, + "sampling/sampling_logp_difference/mean": 0.014262652024626732, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 135.046875, + "completions/mean_terminated_length": 135.046875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.1943216323852539, + "epoch": 1.161764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0982288019009852, + "kl": 0.09366685152053833, + "learning_rate": 7.706744397339022e-07, + "loss": 0.0009, + "num_tokens": 29943847.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5468405485153198, + "sampling/importance_sampling_ratio/mean": 0.9990905523300171, + "sampling/importance_sampling_ratio/min": 0.3681606352329254, + "sampling/sampling_logp_difference/max": 0.999235987663269, + "sampling/sampling_logp_difference/mean": 0.013385389931499958, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 181.25, + "completions/mean_terminated_length": 181.25, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2914361357688904, + "epoch": 1.1629901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.592449862986648, + "kl": 0.1040593683719635, + "learning_rate": 7.700751980829601e-07, + "loss": 0.0265, + "num_tokens": 29974231.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6463134288787842, + "sampling/importance_sampling_ratio/mean": 1.0004979372024536, + "sampling/importance_sampling_ratio/min": 0.6368659138679504, + "sampling/sampling_logp_difference/max": 0.4985384941101074, + "sampling/sampling_logp_difference/mean": 0.015406950376927853, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 143.359375, + "completions/mean_terminated_length": 143.359375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.21524740755558014, + "epoch": 1.1642156862745099, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.404595430725208, + "kl": 0.09196782857179642, + "learning_rate": 7.694754081964754e-07, + "loss": 0.0017, + "num_tokens": 29998974.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.511399269104004, + "sampling/importance_sampling_ratio/mean": 0.9996278285980225, + "sampling/importance_sampling_ratio/min": 0.048157621175050735, + "sampling/sampling_logp_difference/max": 3.033275842666626, + "sampling/sampling_logp_difference/mean": 0.014019029214978218, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 201.84375, + "completions/mean_terminated_length": 201.84375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.22843119502067566, + "epoch": 1.1654411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.62753929659269, + "kl": 0.08088122308254242, + "learning_rate": 7.688750712919839e-07, + "loss": 0.1308, + "num_tokens": 30033316.0, + "reward": 0.65625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.9675623178482056, + "sampling/importance_sampling_ratio/mean": 0.9993234872817993, + "sampling/importance_sampling_ratio/min": 0.5146050453186035, + "sampling/sampling_logp_difference/max": 0.6767954230308533, + "sampling/sampling_logp_difference/mean": 0.013346588239073753, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 220.203125, + "completions/mean_terminated_length": 220.203125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2933310270309448, + "epoch": 1.1666666666666667, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8790975865395463, + "kl": 0.11788161098957062, + "learning_rate": 7.682741885881314e-07, + "loss": -0.0952, + "num_tokens": 30065745.0, + "reward": 0.40625, + "reward_std": 0.747555673122406, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7792528867721558, + "sampling/importance_sampling_ratio/mean": 1.0003098249435425, + "sampling/importance_sampling_ratio/min": 0.10682544112205505, + "sampling/sampling_logp_difference/max": 2.2365591526031494, + "sampling/sampling_logp_difference/mean": 0.016651522368192673, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.29392799735069275, + "epoch": 1.1678921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3190970582439598, + "kl": 0.10862173140048981, + "learning_rate": 7.676727613046719e-07, + "loss": 0.0063, + "num_tokens": 30101649.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9622893333435059, + "sampling/importance_sampling_ratio/mean": 0.9999975562095642, + "sampling/importance_sampling_ratio/min": 0.3799148201942444, + "sampling/sampling_logp_difference/max": 0.9678082466125488, + "sampling/sampling_logp_difference/mean": 0.018010687083005905, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 157.4375, + "completions/mean_terminated_length": 157.4375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.23681524395942688, + "epoch": 1.1691176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2859018028114504, + "kl": 0.11465495824813843, + "learning_rate": 7.670707906624643e-07, + "loss": 0.0129, + "num_tokens": 30126461.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994912147521973, + "sampling/importance_sampling_ratio/min": 0.3715115785598755, + "sampling/sampling_logp_difference/max": 0.9901752471923828, + "sampling/sampling_logp_difference/mean": 0.015134238637983799, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 157.875, + "completions/mean_terminated_length": 157.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.25739586353302, + "epoch": 1.170343137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9051026750783888, + "kl": 0.10601422935724258, + "learning_rate": 7.664682778834712e-07, + "loss": 0.0091, + "num_tokens": 30154149.0, + "reward": 0.0, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9012205600738525, + "sampling/importance_sampling_ratio/mean": 1.0000576972961426, + "sampling/importance_sampling_ratio/min": 0.6222960948944092, + "sampling/sampling_logp_difference/max": 0.6424961090087891, + "sampling/sampling_logp_difference/mean": 0.014406044036149979, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 171.59375, + "completions/mean_terminated_length": 171.59375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.26630714535713196, + "epoch": 1.1715686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2159802984754928, + "kl": 0.11691936105489731, + "learning_rate": 7.658652241907554e-07, + "loss": -0.0028, + "num_tokens": 30179435.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5972579717636108, + "sampling/importance_sampling_ratio/mean": 1.0007264614105225, + "sampling/importance_sampling_ratio/min": 0.5100693702697754, + "sampling/sampling_logp_difference/max": 0.6732085347175598, + "sampling/sampling_logp_difference/mean": 0.01576792448759079, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 181.203125, + "completions/mean_terminated_length": 181.203125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2374798059463501, + "epoch": 1.1727941176470589, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9398123426729155, + "kl": 0.12092936784029007, + "learning_rate": 7.652616308084774e-07, + "loss": -0.0298, + "num_tokens": 30210264.0, + "reward": 0.3125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 0.9997060298919678, + "sampling/importance_sampling_ratio/min": 0.6082713603973389, + "sampling/sampling_logp_difference/max": 0.49713414907455444, + "sampling/sampling_logp_difference/mean": 0.012684464454650879, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 157.703125, + "completions/mean_terminated_length": 157.703125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.19216549396514893, + "epoch": 1.1740196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1590387512992275, + "kl": 0.11101230978965759, + "learning_rate": 7.646574989618937e-07, + "loss": 0.011, + "num_tokens": 30236373.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7858788967132568, + "sampling/importance_sampling_ratio/mean": 1.0005167722702026, + "sampling/importance_sampling_ratio/min": 0.5496814846992493, + "sampling/sampling_logp_difference/max": 0.5984163284301758, + "sampling/sampling_logp_difference/mean": 0.012150926515460014, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 169.328125, + "completions/mean_terminated_length": 169.328125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.22564482688903809, + "epoch": 1.1752450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.209704009097848, + "kl": 0.09982454031705856, + "learning_rate": 7.640528298773536e-07, + "loss": -0.0015, + "num_tokens": 30263402.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6621389389038086, + "sampling/importance_sampling_ratio/mean": 1.0002570152282715, + "sampling/importance_sampling_ratio/min": 0.41285261511802673, + "sampling/sampling_logp_difference/max": 0.8846646547317505, + "sampling/sampling_logp_difference/mean": 0.01376567967236042, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 182.34375, + "completions/mean_terminated_length": 182.34375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24452808499336243, + "epoch": 1.1764705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1652783005328748, + "kl": 0.12087363749742508, + "learning_rate": 7.634476247822972e-07, + "loss": -0.0011, + "num_tokens": 30291136.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.820923924446106, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.4822184443473816, + "sampling/sampling_logp_difference/max": 0.7293580770492554, + "sampling/sampling_logp_difference/mean": 0.014854757115244865, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.23493146896362305, + "epoch": 1.1776960784313726, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.715701720905447, + "kl": 0.11910027265548706, + "learning_rate": 7.628418849052523e-07, + "loss": -0.0006, + "num_tokens": 30317792.0, + "reward": 0.75, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.7189116477966309, + "sampling/importance_sampling_ratio/mean": 1.0002059936523438, + "sampling/importance_sampling_ratio/min": 0.6273089051246643, + "sampling/sampling_logp_difference/max": 0.5416913032531738, + "sampling/sampling_logp_difference/mean": 0.013392720371484756, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 159.875, + "completions/mean_terminated_length": 159.875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.23814696073532104, + "epoch": 1.178921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1236190687946874, + "kl": 0.10747385770082474, + "learning_rate": 7.622356114758327e-07, + "loss": -0.0233, + "num_tokens": 30344984.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001795291900635, + "sampling/importance_sampling_ratio/min": 0.03683508187532425, + "sampling/sampling_logp_difference/max": 3.301304578781128, + "sampling/sampling_logp_difference/mean": 0.014577718451619148, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 173.796875, + "completions/mean_terminated_length": 173.796875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.25150713324546814, + "epoch": 1.1801470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.207080415526954, + "kl": 0.10294643044471741, + "learning_rate": 7.616288057247349e-07, + "loss": -0.0088, + "num_tokens": 30375211.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6083003282546997, + "sampling/importance_sampling_ratio/mean": 0.9993696212768555, + "sampling/importance_sampling_ratio/min": 0.1920984983444214, + "sampling/sampling_logp_difference/max": 1.6497470140457153, + "sampling/sampling_logp_difference/mean": 0.014811830595135689, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 130.15625, + "completions/mean_terminated_length": 130.15625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.27737027406692505, + "epoch": 1.1813725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060223855735233534, + "kl": 0.11574649065732956, + "learning_rate": 7.610214688837361e-07, + "loss": 0.0011, + "num_tokens": 30410293.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4464823007583618, + "sampling/importance_sampling_ratio/mean": 0.9997907280921936, + "sampling/importance_sampling_ratio/min": 0.6293777227401733, + "sampling/sampling_logp_difference/max": 0.46302366256713867, + "sampling/sampling_logp_difference/mean": 0.014945710077881813, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 147.046875, + "completions/mean_terminated_length": 147.046875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.1855459064245224, + "epoch": 1.1825980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048182256152113835, + "kl": 0.08147725462913513, + "learning_rate": 7.604136021856916e-07, + "loss": 0.0008, + "num_tokens": 30436168.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007353067398071, + "sampling/importance_sampling_ratio/mean": 1.0000842809677124, + "sampling/importance_sampling_ratio/min": 0.5712746977806091, + "sampling/sampling_logp_difference/max": 0.5598850250244141, + "sampling/sampling_logp_difference/mean": 0.012940846383571625, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 131.890625, + "completions/mean_terminated_length": 131.890625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.23324932157993317, + "epoch": 1.1838235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4133326252559428, + "kl": 0.10719141364097595, + "learning_rate": 7.598052068645324e-07, + "loss": -0.0014, + "num_tokens": 30466737.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6355787515640259, + "sampling/importance_sampling_ratio/mean": 1.0004844665527344, + "sampling/importance_sampling_ratio/min": 0.614501953125, + "sampling/sampling_logp_difference/max": 0.49199676513671875, + "sampling/sampling_logp_difference/mean": 0.014336496591567993, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 155.859375, + "completions/mean_terminated_length": 155.859375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.21427032351493835, + "epoch": 1.1850490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3469113174737197, + "kl": 0.0982939600944519, + "learning_rate": 7.591962841552626e-07, + "loss": 0.0031, + "num_tokens": 30502136.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.563099980354309, + "sampling/importance_sampling_ratio/mean": 1.0004643201828003, + "sampling/importance_sampling_ratio/min": 0.43885937333106995, + "sampling/sampling_logp_difference/max": 0.8235762119293213, + "sampling/sampling_logp_difference/mean": 0.014352173544466496, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 181.484375, + "completions/mean_terminated_length": 181.484375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24575388431549072, + "epoch": 1.1862745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1527970986647007, + "kl": 0.10954149812459946, + "learning_rate": 7.585868352939562e-07, + "loss": 0.0126, + "num_tokens": 30530631.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7687509059906006, + "sampling/importance_sampling_ratio/mean": 1.0001543760299683, + "sampling/importance_sampling_ratio/min": 0.6259166598320007, + "sampling/sampling_logp_difference/max": 0.5702736377716064, + "sampling/sampling_logp_difference/mean": 0.013726416043937206, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 159.578125, + "completions/mean_terminated_length": 159.578125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.32892221212387085, + "epoch": 1.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0645469121857873, + "kl": 0.10489454120397568, + "learning_rate": 7.579768615177564e-07, + "loss": 0.001, + "num_tokens": 30557644.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.948359489440918, + "sampling/importance_sampling_ratio/mean": 0.9999338388442993, + "sampling/importance_sampling_ratio/min": 0.6127480864524841, + "sampling/sampling_logp_difference/max": 0.6669877767562866, + "sampling/sampling_logp_difference/mean": 0.01787525787949562, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 258.515625, + "completions/mean_terminated_length": 258.515625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.31929582357406616, + "epoch": 1.1887254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7019363401511123, + "kl": 0.10821347683668137, + "learning_rate": 7.57366364064871e-07, + "loss": 0.0424, + "num_tokens": 30594285.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5922117233276367, + "sampling/importance_sampling_ratio/mean": 1.0006930828094482, + "sampling/importance_sampling_ratio/min": 0.5471285581588745, + "sampling/sampling_logp_difference/max": 0.6030714511871338, + "sampling/sampling_logp_difference/mean": 0.015439395792782307, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 181.5625, + "completions/mean_terminated_length": 181.5625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.2947167158126831, + "epoch": 1.1899509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3912095770517858, + "kl": 0.13718779385089874, + "learning_rate": 7.567553441745711e-07, + "loss": -0.0096, + "num_tokens": 30629057.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5678318738937378, + "sampling/importance_sampling_ratio/mean": 0.9998456239700317, + "sampling/importance_sampling_ratio/min": 0.5356789231300354, + "sampling/sampling_logp_difference/max": 0.6242203712463379, + "sampling/sampling_logp_difference/mean": 0.014373021200299263, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 180.140625, + "completions/mean_terminated_length": 180.140625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.30969664454460144, + "epoch": 1.1911764705882353, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7556685828599283, + "kl": 0.11412659287452698, + "learning_rate": 7.561438030871885e-07, + "loss": 0.0002, + "num_tokens": 30657306.0, + "reward": 0.65625, + "reward_std": 0.7015564441680908, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4793007373809814, + "sampling/importance_sampling_ratio/mean": 0.9995293617248535, + "sampling/importance_sampling_ratio/min": 0.5742098093032837, + "sampling/sampling_logp_difference/max": 0.5547604560852051, + "sampling/sampling_logp_difference/mean": 0.015855111181735992, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 218.890625, + "completions/mean_terminated_length": 218.890625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.33018720149993896, + "epoch": 1.1924019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06753584191478308, + "kl": 0.12269055843353271, + "learning_rate": 7.555317420441129e-07, + "loss": 0.0012, + "num_tokens": 30691955.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997884035110474, + "sampling/importance_sampling_ratio/min": 0.5038020610809326, + "sampling/sampling_logp_difference/max": 1.0370073318481445, + "sampling/sampling_logp_difference/mean": 0.01559330802410841, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 206.53125, + "completions/mean_terminated_length": 206.53125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2860816717147827, + "epoch": 1.1936274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.058299976937103, + "kl": 0.0953373983502388, + "learning_rate": 7.549191622877892e-07, + "loss": 0.0152, + "num_tokens": 30723781.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.556614875793457, + "sampling/importance_sampling_ratio/mean": 0.9998517036437988, + "sampling/importance_sampling_ratio/min": 0.6611390113830566, + "sampling/sampling_logp_difference/max": 0.44251346588134766, + "sampling/sampling_logp_difference/mean": 0.014137894846498966, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 252.828125, + "completions/mean_terminated_length": 252.828125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.32496657967567444, + "epoch": 1.1948529411764706, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.551164058632505, + "kl": 0.10039462894201279, + "learning_rate": 7.543060650617158e-07, + "loss": 0.0045, + "num_tokens": 30758474.0, + "reward": 0.0, + "reward_std": 0.6143567562103271, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6483101844787598, + "sampling/importance_sampling_ratio/mean": 0.999916672706604, + "sampling/importance_sampling_ratio/min": 0.4820709228515625, + "sampling/sampling_logp_difference/max": 0.7296640872955322, + "sampling/sampling_logp_difference/mean": 0.01448042131960392, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 181.09375, + "completions/mean_terminated_length": 181.09375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.33883771300315857, + "epoch": 1.196078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0672261850240166, + "kl": 0.1487603634595871, + "learning_rate": 7.53692451610441e-07, + "loss": 0.0177, + "num_tokens": 30789824.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5100367069244385, + "sampling/importance_sampling_ratio/mean": 1.0001670122146606, + "sampling/importance_sampling_ratio/min": 0.37719595432281494, + "sampling/sampling_logp_difference/max": 0.9749904870986938, + "sampling/sampling_logp_difference/mean": 0.016527269035577774, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 224.953125, + "completions/mean_terminated_length": 224.953125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.30383163690567017, + "epoch": 1.1973039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.43357950972283, + "kl": 0.10215874016284943, + "learning_rate": 7.530783231795614e-07, + "loss": -0.065, + "num_tokens": 30821213.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.785274863243103, + "sampling/importance_sampling_ratio/mean": 0.9999973177909851, + "sampling/importance_sampling_ratio/min": 0.6181599497795105, + "sampling/sampling_logp_difference/max": 0.579572319984436, + "sampling/sampling_logp_difference/mean": 0.015095638111233711, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 212.84375, + "completions/mean_terminated_length": 212.84375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2441040575504303, + "epoch": 1.1985294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3686507933610939, + "kl": 0.08347011357545853, + "learning_rate": 7.524636810157188e-07, + "loss": -0.0328, + "num_tokens": 30853107.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8847352266311646, + "sampling/importance_sampling_ratio/mean": 0.999988317489624, + "sampling/importance_sampling_ratio/min": 0.5676478147506714, + "sampling/sampling_logp_difference/max": 0.6337873935699463, + "sampling/sampling_logp_difference/mean": 0.013035210780799389, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 176.046875, + "completions/mean_terminated_length": 176.046875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2641918659210205, + "epoch": 1.1997549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3487198731382968, + "kl": 0.0982346385717392, + "learning_rate": 7.518485263665977e-07, + "loss": -0.0045, + "num_tokens": 30881318.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992034435272217, + "sampling/importance_sampling_ratio/min": 0.49549606442451477, + "sampling/sampling_logp_difference/max": 0.7284934520721436, + "sampling/sampling_logp_difference/mean": 0.017063260078430176, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 228.484375, + "completions/mean_terminated_length": 228.484375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.32671964168548584, + "epoch": 1.2009803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9774460006360761, + "kl": 0.11252576112747192, + "learning_rate": 7.512328604809232e-07, + "loss": 0.0087, + "num_tokens": 30910549.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.558348298072815, + "sampling/importance_sampling_ratio/mean": 0.9993785619735718, + "sampling/importance_sampling_ratio/min": 0.4324365556240082, + "sampling/sampling_logp_difference/max": 0.8383196592330933, + "sampling/sampling_logp_difference/mean": 0.01594432070851326, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 235.65625, + "completions/mean_terminated_length": 235.65625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.300775945186615, + "epoch": 1.2022058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6703505763764348, + "kl": 0.09750582277774811, + "learning_rate": 7.506166846084579e-07, + "loss": -0.0365, + "num_tokens": 30943855.0, + "reward": -0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.649430513381958, + "sampling/importance_sampling_ratio/mean": 1.000500202178955, + "sampling/importance_sampling_ratio/min": 0.5074718594551086, + "sampling/sampling_logp_difference/max": 0.6783139705657959, + "sampling/sampling_logp_difference/mean": 0.015568524599075317, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 275.171875, + "completions/mean_terminated_length": 275.171875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3914821743965149, + "epoch": 1.2034313725490196, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9709409411135073, + "kl": 0.09965787827968597, + "learning_rate": 7.5e-07, + "loss": -0.0209, + "num_tokens": 30991098.0, + "reward": 0.65625, + "reward_std": 0.6223389506340027, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5761357545852661, + "sampling/importance_sampling_ratio/mean": 1.000213861465454, + "sampling/importance_sampling_ratio/min": 0.40983694791793823, + "sampling/sampling_logp_difference/max": 0.891995906829834, + "sampling/sampling_logp_difference/mean": 0.01726599782705307, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 176.625, + "completions/mean_terminated_length": 176.625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.19782614707946777, + "epoch": 1.204656862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.73829266014413, + "kl": 0.1357603371143341, + "learning_rate": 7.493828079073801e-07, + "loss": 0.0341, + "num_tokens": 31015618.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5277687311172485, + "sampling/importance_sampling_ratio/mean": 0.9999630451202393, + "sampling/importance_sampling_ratio/min": 0.4468963146209717, + "sampling/sampling_logp_difference/max": 0.8054287433624268, + "sampling/sampling_logp_difference/mean": 0.014209914021193981, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 168.84375, + "completions/mean_terminated_length": 168.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.22640889883041382, + "epoch": 1.2058823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1397833334244651, + "kl": 0.10726204514503479, + "learning_rate": 7.487651095834588e-07, + "loss": 0.0156, + "num_tokens": 31040888.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.443230152130127, + "sampling/importance_sampling_ratio/mean": 0.9990359544754028, + "sampling/importance_sampling_ratio/min": 0.6210910677909851, + "sampling/sampling_logp_difference/max": 0.47627758979797363, + "sampling/sampling_logp_difference/mean": 0.012890784069895744, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 249.34375, + "completions/mean_terminated_length": 249.34375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.4012042284011841, + "epoch": 1.2071078431372548, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2776803800898437, + "kl": 0.12288743257522583, + "learning_rate": 7.481469062821251e-07, + "loss": -0.0047, + "num_tokens": 31074350.0, + "reward": 0.15625, + "reward_std": 0.769389271736145, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.674807071685791, + "sampling/importance_sampling_ratio/mean": 0.9999292492866516, + "sampling/importance_sampling_ratio/min": 0.5288627743721008, + "sampling/sampling_logp_difference/max": 0.637026309967041, + "sampling/sampling_logp_difference/mean": 0.016513977199792862, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 221.84375, + "completions/mean_terminated_length": 221.84375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3115932047367096, + "epoch": 1.2083333333333333, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6245114122560547, + "kl": 0.11574351787567139, + "learning_rate": 7.47528199258292e-07, + "loss": 0.0316, + "num_tokens": 31107060.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5633171796798706, + "sampling/importance_sampling_ratio/mean": 1.000051736831665, + "sampling/importance_sampling_ratio/min": 0.5911699533462524, + "sampling/sampling_logp_difference/max": 0.525651752948761, + "sampling/sampling_logp_difference/mean": 0.015016937628388405, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 169.859375, + "completions/mean_terminated_length": 169.859375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.24668557941913605, + "epoch": 1.2095588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.136974081092079, + "kl": 0.12169291079044342, + "learning_rate": 7.469089897678957e-07, + "loss": -0.0077, + "num_tokens": 31131163.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5935778617858887, + "sampling/importance_sampling_ratio/mean": 0.9997788667678833, + "sampling/importance_sampling_ratio/min": 0.48457133769989014, + "sampling/sampling_logp_difference/max": 0.7244906425476074, + "sampling/sampling_logp_difference/mean": 0.012976177036762238, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 179.390625, + "completions/mean_terminated_length": 179.390625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.34161376953125, + "epoch": 1.2107843137254901, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6586332707412077, + "kl": 0.20643259584903717, + "learning_rate": 7.462892790678925e-07, + "loss": 0.0198, + "num_tokens": 31160276.0, + "reward": 0.65625, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5905529260635376, + "sampling/importance_sampling_ratio/mean": 1.000783085823059, + "sampling/importance_sampling_ratio/min": 0.6451686024665833, + "sampling/sampling_logp_difference/max": 0.4640817642211914, + "sampling/sampling_logp_difference/mean": 0.015325578860938549, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 203.765625, + "completions/mean_terminated_length": 203.765625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.32241612672805786, + "epoch": 1.2120098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2146281829114274, + "kl": 0.07475961744785309, + "learning_rate": 7.456690684162556e-07, + "loss": 0.0035, + "num_tokens": 31187781.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.9699103832244873, + "sampling/importance_sampling_ratio/mean": 0.9992858171463013, + "sampling/importance_sampling_ratio/min": 0.25922614336013794, + "sampling/sampling_logp_difference/max": 1.350054383277893, + "sampling/sampling_logp_difference/mean": 0.016333363950252533, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 183.078125, + "completions/mean_terminated_length": 183.078125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3116165101528168, + "epoch": 1.213235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2658097084896718, + "kl": 0.09635313600301743, + "learning_rate": 7.450483590719736e-07, + "loss": -0.0309, + "num_tokens": 31228314.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.7604209184646606, + "sampling/importance_sampling_ratio/mean": 1.0004222393035889, + "sampling/importance_sampling_ratio/min": 0.4276825189590454, + "sampling/sampling_logp_difference/max": 0.8493741750717163, + "sampling/sampling_logp_difference/mean": 0.016576137393712997, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 165.109375, + "completions/mean_terminated_length": 165.109375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3262573480606079, + "epoch": 1.2144607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.240263089907162, + "kl": 0.09773312509059906, + "learning_rate": 7.444271522950468e-07, + "loss": -0.0025, + "num_tokens": 31255633.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5864391326904297, + "sampling/importance_sampling_ratio/mean": 1.0003514289855957, + "sampling/importance_sampling_ratio/min": 0.5683965682983398, + "sampling/sampling_logp_difference/max": 0.5649359226226807, + "sampling/sampling_logp_difference/mean": 0.014754691161215305, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 189.234375, + "completions/mean_terminated_length": 189.234375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.22865843772888184, + "epoch": 1.215686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036086823453182544, + "kl": 0.08012405782938004, + "learning_rate": 7.438054493464859e-07, + "loss": 0.0008, + "num_tokens": 31288560.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001484155654907, + "sampling/importance_sampling_ratio/min": 0.4468447268009186, + "sampling/sampling_logp_difference/max": 0.8055441379547119, + "sampling/sampling_logp_difference/mean": 0.012340845540165901, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 180.359375, + "completions/mean_terminated_length": 180.359375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.2875712513923645, + "epoch": 1.2169117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05782685284992099, + "kl": 0.10270027816295624, + "learning_rate": 7.431832514883081e-07, + "loss": 0.001, + "num_tokens": 31316295.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6122339963912964, + "sampling/importance_sampling_ratio/mean": 1.0004626512527466, + "sampling/importance_sampling_ratio/min": 0.5098070502281189, + "sampling/sampling_logp_difference/max": 0.6737229824066162, + "sampling/sampling_logp_difference/mean": 0.014431476593017578, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 164.796875, + "completions/mean_terminated_length": 164.796875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2780784070491791, + "epoch": 1.218137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.299354881607356, + "kl": 0.0973915159702301, + "learning_rate": 7.42560559983536e-07, + "loss": 0.0034, + "num_tokens": 31345178.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4421542882919312, + "sampling/importance_sampling_ratio/mean": 0.9997124075889587, + "sampling/importance_sampling_ratio/min": 0.590282142162323, + "sampling/sampling_logp_difference/max": 0.5271546244621277, + "sampling/sampling_logp_difference/mean": 0.014276275411248207, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 207.40625, + "completions/mean_terminated_length": 207.40625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.33022767305374146, + "epoch": 1.219362745098039, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9557828840884965, + "kl": 0.12490814924240112, + "learning_rate": 7.419373760961939e-07, + "loss": 0.0236, + "num_tokens": 31378676.0, + "reward": 0.84375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996752738952637, + "sampling/importance_sampling_ratio/min": 0.5823451280593872, + "sampling/sampling_logp_difference/max": 1.0023880004882812, + "sampling/sampling_logp_difference/mean": 0.016804654151201248, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 179.28125, + "completions/mean_terminated_length": 179.28125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.30481797456741333, + "epoch": 1.2205882352941178, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.502793416021743, + "kl": 0.153112530708313, + "learning_rate": 7.413137010913054e-07, + "loss": -0.062, + "num_tokens": 31405878.0, + "reward": -0.03125, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6448822021484375, + "sampling/importance_sampling_ratio/mean": 0.9992611408233643, + "sampling/importance_sampling_ratio/min": 0.4682300388813019, + "sampling/sampling_logp_difference/max": 0.7587955594062805, + "sampling/sampling_logp_difference/mean": 0.015971675515174866, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 218.828125, + "completions/mean_terminated_length": 218.828125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.37005671858787537, + "epoch": 1.221813725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6671530498676614, + "kl": 0.14781561493873596, + "learning_rate": 7.406895362348915e-07, + "loss": -0.029, + "num_tokens": 31442507.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.7278977632522583, + "sampling/importance_sampling_ratio/mean": 1.0003068447113037, + "sampling/importance_sampling_ratio/min": 0.6104914546012878, + "sampling/sampling_logp_difference/max": 0.546905517578125, + "sampling/sampling_logp_difference/mean": 0.016775382682681084, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 197.109375, + "completions/mean_terminated_length": 197.109375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3034955859184265, + "epoch": 1.2230392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0030801566549996, + "kl": 0.10263898223638535, + "learning_rate": 7.400648827939671e-07, + "loss": 0.0125, + "num_tokens": 31473474.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6770514249801636, + "sampling/importance_sampling_ratio/mean": 0.9998710751533508, + "sampling/importance_sampling_ratio/min": 0.6035663485527039, + "sampling/sampling_logp_difference/max": 0.5170371532440186, + "sampling/sampling_logp_difference/mean": 0.014926435425877571, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 200.125, + "completions/mean_terminated_length": 200.125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.363971471786499, + "epoch": 1.224264705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2002474362045186, + "kl": 0.12655779719352722, + "learning_rate": 7.394397420365392e-07, + "loss": 0.0484, + "num_tokens": 31505114.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.524776577949524, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.47316774725914, + "sampling/sampling_logp_difference/max": 0.7483053207397461, + "sampling/sampling_logp_difference/mean": 0.014686089009046555, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 184.578125, + "completions/mean_terminated_length": 184.578125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.26632946729660034, + "epoch": 1.2254901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5162950548729182, + "kl": 0.07559464871883392, + "learning_rate": 7.388141152316038e-07, + "loss": -0.0356, + "num_tokens": 31533007.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6545816659927368, + "sampling/importance_sampling_ratio/mean": 1.0002281665802002, + "sampling/importance_sampling_ratio/min": 0.24965912103652954, + "sampling/sampling_logp_difference/max": 1.3876588344573975, + "sampling/sampling_logp_difference/mean": 0.01353788748383522, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 153.421875, + "completions/mean_terminated_length": 153.421875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.24606212973594666, + "epoch": 1.2267156862745099, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.521539633292881, + "kl": 0.11632443964481354, + "learning_rate": 7.381880036491439e-07, + "loss": 0.0603, + "num_tokens": 31555610.0, + "reward": 0.71875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.6533600091934204, + "sampling/importance_sampling_ratio/mean": 0.9998239278793335, + "sampling/importance_sampling_ratio/min": 0.5483725070953369, + "sampling/sampling_logp_difference/max": 0.6008005142211914, + "sampling/sampling_logp_difference/mean": 0.013158413581550121, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 231.484375, + "completions/mean_terminated_length": 231.484375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.38222363591194153, + "epoch": 1.2279411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.523169943336086, + "kl": 0.11676372587680817, + "learning_rate": 7.375614085601264e-07, + "loss": -0.0913, + "num_tokens": 31591001.0, + "reward": 0.125, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999423623085022, + "sampling/importance_sampling_ratio/min": 0.6058565974235535, + "sampling/sampling_logp_difference/max": 1.1164112091064453, + "sampling/sampling_logp_difference/mean": 0.01587112993001938, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 163.46875, + "completions/mean_terminated_length": 163.46875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.30908840894699097, + "epoch": 1.2291666666666667, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.991581597290809, + "kl": 0.21817518770694733, + "learning_rate": 7.369343312364993e-07, + "loss": 0.0015, + "num_tokens": 31616423.0, + "reward": 0.625, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.9695045948028564, + "sampling/importance_sampling_ratio/mean": 0.9995729923248291, + "sampling/importance_sampling_ratio/min": 0.6912212371826172, + "sampling/sampling_logp_difference/max": 0.6777820587158203, + "sampling/sampling_logp_difference/mean": 0.015119457617402077, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 179.671875, + "completions/mean_terminated_length": 179.671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3759731650352478, + "epoch": 1.2303921568627452, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0610345809582244, + "kl": 0.17271912097930908, + "learning_rate": 7.363067729511901e-07, + "loss": 0.0265, + "num_tokens": 31648354.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000411033630371, + "sampling/importance_sampling_ratio/min": 0.5543676614761353, + "sampling/sampling_logp_difference/max": 0.7612829208374023, + "sampling/sampling_logp_difference/mean": 0.016802560538053513, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3747866153717041, + "epoch": 1.2316176470588236, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8112533158466604, + "kl": 0.12410871684551239, + "learning_rate": 7.356787349781022e-07, + "loss": 0.0169, + "num_tokens": 31681882.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5346338748931885, + "sampling/importance_sampling_ratio/mean": 0.9999476671218872, + "sampling/importance_sampling_ratio/min": 0.5783976912498474, + "sampling/sampling_logp_difference/max": 0.5474936962127686, + "sampling/sampling_logp_difference/mean": 0.017098799347877502, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 202.734375, + "completions/mean_terminated_length": 202.734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3643556833267212, + "epoch": 1.232843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7475408566935873, + "kl": 0.15187852084636688, + "learning_rate": 7.350502185921131e-07, + "loss": -0.0309, + "num_tokens": 31713049.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.8180066347122192, + "sampling/importance_sampling_ratio/mean": 1.0002341270446777, + "sampling/importance_sampling_ratio/min": 0.6139999032020569, + "sampling/sampling_logp_difference/max": 0.597740650177002, + "sampling/sampling_logp_difference/mean": 0.016775190830230713, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 186.890625, + "completions/mean_terminated_length": 186.890625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.34378689527511597, + "epoch": 1.2340686274509804, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.246849376728109, + "kl": 0.16027387976646423, + "learning_rate": 7.344212250690711e-07, + "loss": 0.0148, + "num_tokens": 31739170.0, + "reward": 0.28125, + "reward_std": 0.6337460875511169, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6463134288787842, + "sampling/importance_sampling_ratio/mean": 0.99985671043396, + "sampling/importance_sampling_ratio/min": 0.4597095847129822, + "sampling/sampling_logp_difference/max": 0.7771602869033813, + "sampling/sampling_logp_difference/mean": 0.01568588614463806, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 207.3125, + "completions/mean_terminated_length": 207.3125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.27392303943634033, + "epoch": 1.2352941176470589, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1483814390497153, + "kl": 0.1423334777355194, + "learning_rate": 7.337917556857934e-07, + "loss": 0.0245, + "num_tokens": 31773126.0, + "reward": 0.53125, + "reward_std": 0.565913200378418, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6795238256454468, + "sampling/importance_sampling_ratio/mean": 1.0003316402435303, + "sampling/importance_sampling_ratio/min": 0.6329535245895386, + "sampling/sampling_logp_difference/max": 0.5185103416442871, + "sampling/sampling_logp_difference/mean": 0.01408257894217968, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 188.578125, + "completions/mean_terminated_length": 188.578125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3587127923965454, + "epoch": 1.2365196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0526170599458136, + "kl": 0.13530303537845612, + "learning_rate": 7.331618117200625e-07, + "loss": 0.0041, + "num_tokens": 31805003.0, + "reward": -0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6602283716201782, + "sampling/importance_sampling_ratio/mean": 1.0001368522644043, + "sampling/importance_sampling_ratio/min": 0.42615729570388794, + "sampling/sampling_logp_difference/max": 0.8529467582702637, + "sampling/sampling_logp_difference/mean": 0.015614226460456848, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 174.96875, + "completions/mean_terminated_length": 174.96875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2448197603225708, + "epoch": 1.2377450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03888673529603373, + "kl": 0.08840734511613846, + "learning_rate": 7.325313944506253e-07, + "loss": 0.0009, + "num_tokens": 31836025.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004358291625977, + "sampling/importance_sampling_ratio/min": 0.22012530267238617, + "sampling/sampling_logp_difference/max": 1.5135583877563477, + "sampling/sampling_logp_difference/mean": 0.013241377659142017, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 158.078125, + "completions/mean_terminated_length": 158.078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.20651528239250183, + "epoch": 1.2389705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04904226566753591, + "kl": 0.07498851418495178, + "learning_rate": 7.319005051571885e-07, + "loss": 0.0007, + "num_tokens": 31861294.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996029138565063, + "sampling/importance_sampling_ratio/min": 0.6086891293525696, + "sampling/sampling_logp_difference/max": 0.8189167976379395, + "sampling/sampling_logp_difference/mean": 0.012868411839008331, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 194.046875, + "completions/mean_terminated_length": 194.046875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3027922809123993, + "epoch": 1.2401960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2038356318333772, + "kl": 0.09024074673652649, + "learning_rate": 7.312691451204177e-07, + "loss": -0.0035, + "num_tokens": 31894001.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5910779237747192, + "sampling/importance_sampling_ratio/mean": 1.0005966424942017, + "sampling/importance_sampling_ratio/min": 0.604347288608551, + "sampling/sampling_logp_difference/max": 0.5036063194274902, + "sampling/sampling_logp_difference/mean": 0.01586068421602249, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 158.234375, + "completions/mean_terminated_length": 158.234375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.19949010014533997, + "epoch": 1.241421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06606497988914023, + "kl": 0.08616028726100922, + "learning_rate": 7.306373156219335e-07, + "loss": 0.0009, + "num_tokens": 31918672.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6798814535140991, + "sampling/importance_sampling_ratio/mean": 1.0005948543548584, + "sampling/importance_sampling_ratio/min": 0.6569399833679199, + "sampling/sampling_logp_difference/max": 0.5187232494354248, + "sampling/sampling_logp_difference/mean": 0.010515892878174782, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 223.390625, + "completions/mean_terminated_length": 223.390625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.33404579758644104, + "epoch": 1.2426470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9159336896469366, + "kl": 0.09262588620185852, + "learning_rate": 7.300050179443099e-07, + "loss": 0.0153, + "num_tokens": 31952777.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997861981391907, + "sampling/importance_sampling_ratio/min": 0.43403762578964233, + "sampling/sampling_logp_difference/max": 0.8346240520477295, + "sampling/sampling_logp_difference/mean": 0.015576720237731934, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 192.9375, + "completions/mean_terminated_length": 192.9375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.4179092347621918, + "epoch": 1.2438725490196079, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.142086344004228, + "kl": 0.15286509692668915, + "learning_rate": 7.293722533710714e-07, + "loss": 0.0182, + "num_tokens": 31999365.0, + "reward": 0.09375, + "reward_std": 0.676956295967102, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5148899555206299, + "sampling/importance_sampling_ratio/mean": 0.9996585249900818, + "sampling/importance_sampling_ratio/min": 0.3519838750362396, + "sampling/sampling_logp_difference/max": 1.0441699028015137, + "sampling/sampling_logp_difference/mean": 0.019419480115175247, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 179.125, + "completions/mean_terminated_length": 179.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.34504783153533936, + "epoch": 1.2450980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.875871222576556, + "kl": 0.15173397958278656, + "learning_rate": 7.287390231866893e-07, + "loss": -0.043, + "num_tokens": 32026509.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6614950895309448, + "sampling/importance_sampling_ratio/mean": 1.0002130270004272, + "sampling/importance_sampling_ratio/min": 0.4344994127750397, + "sampling/sampling_logp_difference/max": 0.8335607051849365, + "sampling/sampling_logp_difference/mean": 0.017975609749555588, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 207.171875, + "completions/mean_terminated_length": 207.171875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.27351927757263184, + "epoch": 1.2463235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8365602884455642, + "kl": 0.13922347128391266, + "learning_rate": 7.281053286765815e-07, + "loss": -0.0077, + "num_tokens": 32057976.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.829240083694458, + "sampling/importance_sampling_ratio/mean": 0.9998570680618286, + "sampling/importance_sampling_ratio/min": 0.475933700799942, + "sampling/sampling_logp_difference/max": 0.7424767017364502, + "sampling/sampling_logp_difference/mean": 0.013677925802767277, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 235.953125, + "completions/mean_terminated_length": 235.953125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.31339412927627563, + "epoch": 1.2475490196078431, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.751014458954254, + "kl": 0.12213044613599777, + "learning_rate": 7.274711711271073e-07, + "loss": 0.0458, + "num_tokens": 32090341.0, + "reward": 0.4375, + "reward_std": 0.6707825064659119, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6643818616867065, + "sampling/importance_sampling_ratio/mean": 1.0001444816589355, + "sampling/importance_sampling_ratio/min": 0.5219727158546448, + "sampling/sampling_logp_difference/max": 0.6501400470733643, + "sampling/sampling_logp_difference/mean": 0.015251624397933483, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 204.296875, + "completions/mean_terminated_length": 204.296875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2566416263580322, + "epoch": 1.2487745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6707983897426926, + "kl": 0.11317183077335358, + "learning_rate": 7.268365518255665e-07, + "loss": -0.0106, + "num_tokens": 32119048.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.8755946159362793, + "sampling/importance_sampling_ratio/mean": 1.000259280204773, + "sampling/importance_sampling_ratio/min": 0.5523190498352051, + "sampling/sampling_logp_difference/max": 0.6289258003234863, + "sampling/sampling_logp_difference/mean": 0.014619041234254837, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 200.21875, + "completions/mean_terminated_length": 200.21875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.347176194190979, + "epoch": 1.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06262725495643849, + "kl": 0.15429773926734924, + "learning_rate": 7.262014720601958e-07, + "loss": 0.0014, + "num_tokens": 32159574.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998721480369568, + "sampling/importance_sampling_ratio/min": 0.5038028955459595, + "sampling/sampling_logp_difference/max": 0.7020881175994873, + "sampling/sampling_logp_difference/mean": 0.017093859612941742, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 242.828125, + "completions/mean_terminated_length": 242.828125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2813074588775635, + "epoch": 1.2512254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4754270795558369, + "kl": 0.07705482840538025, + "learning_rate": 7.255659331201673e-07, + "loss": -0.0177, + "num_tokens": 32196523.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6573729515075684, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.5038020610809326, + "sampling/sampling_logp_difference/max": 0.6855719089508057, + "sampling/sampling_logp_difference/mean": 0.014529230073094368, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 190.859375, + "completions/mean_terminated_length": 190.859375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3343523442745209, + "epoch": 1.2524509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7682001494721453, + "kl": 0.12435860931873322, + "learning_rate": 7.249299362955845e-07, + "loss": -0.0104, + "num_tokens": 32230322.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6121718883514404, + "sampling/importance_sampling_ratio/mean": 1.0002822875976562, + "sampling/importance_sampling_ratio/min": 0.6152330040931702, + "sampling/sampling_logp_difference/max": 0.48575425148010254, + "sampling/sampling_logp_difference/mean": 0.016807734966278076, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 163.984375, + "completions/mean_terminated_length": 163.984375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3668976426124573, + "epoch": 1.2536764705882353, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.507965779502531, + "kl": 0.23807550966739655, + "learning_rate": 7.242934828774808e-07, + "loss": -0.0069, + "num_tokens": 32264401.0, + "reward": -0.125, + "reward_std": 0.6047805547714233, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4725977182388306, + "sampling/importance_sampling_ratio/mean": 0.9996072053909302, + "sampling/importance_sampling_ratio/min": 0.5234352350234985, + "sampling/sampling_logp_difference/max": 0.6473419666290283, + "sampling/sampling_logp_difference/mean": 0.019045095890760422, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 205.21875, + "completions/mean_terminated_length": 205.21875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4307796359062195, + "epoch": 1.2549019607843137, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9349461135817945, + "kl": 0.1441027671098709, + "learning_rate": 7.236565741578162e-07, + "loss": 0.0065, + "num_tokens": 32297695.0, + "reward": 0.40625, + "reward_std": 0.6331988573074341, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6223868131637573, + "sampling/importance_sampling_ratio/mean": 0.9998844265937805, + "sampling/importance_sampling_ratio/min": 0.576189398765564, + "sampling/sampling_logp_difference/max": 0.551318883895874, + "sampling/sampling_logp_difference/mean": 0.018998097628355026, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 228.09375, + "completions/mean_terminated_length": 228.09375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3047144412994385, + "epoch": 1.2561274509803921, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8028179518326652, + "kl": 0.07948464155197144, + "learning_rate": 7.230192114294753e-07, + "loss": -0.0933, + "num_tokens": 32330021.0, + "reward": 0.53125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4912073612213135, + "sampling/importance_sampling_ratio/mean": 1.0001463890075684, + "sampling/importance_sampling_ratio/min": 0.47132372856140137, + "sampling/sampling_logp_difference/max": 0.7522101402282715, + "sampling/sampling_logp_difference/mean": 0.014421386644244194, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 224.25, + "completions/mean_terminated_length": 224.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3037158250808716, + "epoch": 1.2573529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5394101784886198, + "kl": 0.0990850031375885, + "learning_rate": 7.223813959862638e-07, + "loss": 0.0078, + "num_tokens": 32360069.0, + "reward": 0.03125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000302791595459, + "sampling/importance_sampling_ratio/min": 0.4870549738407135, + "sampling/sampling_logp_difference/max": 0.7454228401184082, + "sampling/sampling_logp_difference/mean": 0.01469477266073227, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 200.640625, + "completions/mean_terminated_length": 200.640625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3100473880767822, + "epoch": 1.258578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9361945149371205, + "kl": 0.121149942278862, + "learning_rate": 7.217431291229067e-07, + "loss": -0.0163, + "num_tokens": 32392846.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007360219955444, + "sampling/importance_sampling_ratio/mean": 0.9996852874755859, + "sampling/importance_sampling_ratio/min": 0.4947281777858734, + "sampling/sampling_logp_difference/max": 0.7037467956542969, + "sampling/sampling_logp_difference/mean": 0.016091354191303253, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 179.90625, + "completions/mean_terminated_length": 179.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2418641746044159, + "epoch": 1.2598039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.861874824330707, + "kl": 0.237470343708992, + "learning_rate": 7.211044121350454e-07, + "loss": 0.016, + "num_tokens": 32420632.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.9645437002182007, + "sampling/importance_sampling_ratio/mean": 0.9993852972984314, + "sampling/importance_sampling_ratio/min": 0.08299688994884491, + "sampling/sampling_logp_difference/max": 2.488952159881592, + "sampling/sampling_logp_difference/mean": 0.015440763905644417, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 177.828125, + "completions/mean_terminated_length": 177.828125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.29346632957458496, + "epoch": 1.2610294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3610814823819088, + "kl": 0.13688868284225464, + "learning_rate": 7.204652463192347e-07, + "loss": -0.0255, + "num_tokens": 32452589.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5071556568145752, + "sampling/importance_sampling_ratio/mean": 1.000331163406372, + "sampling/importance_sampling_ratio/min": 0.41115984320640564, + "sampling/sampling_logp_difference/max": 0.8887733221054077, + "sampling/sampling_logp_difference/mean": 0.01535726711153984, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 223.3125, + "completions/mean_terminated_length": 223.3125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3583295941352844, + "epoch": 1.2622549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4573937518867763, + "kl": 0.16331440210342407, + "learning_rate": 7.198256329729411e-07, + "loss": -0.0145, + "num_tokens": 32489489.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6254981756210327, + "sampling/importance_sampling_ratio/mean": 0.9997801780700684, + "sampling/importance_sampling_ratio/min": 0.5264105200767517, + "sampling/sampling_logp_difference/max": 0.6416739225387573, + "sampling/sampling_logp_difference/mean": 0.016224460676312447, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 173.25, + "completions/mean_terminated_length": 173.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3241284489631653, + "epoch": 1.2634803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8754633261538256, + "kl": 0.18393567204475403, + "learning_rate": 7.191855733945386e-07, + "loss": -0.0036, + "num_tokens": 32525889.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.737408995628357, + "sampling/importance_sampling_ratio/mean": 0.9993525147438049, + "sampling/importance_sampling_ratio/min": 0.4397071599960327, + "sampling/sampling_logp_difference/max": 0.8216463327407837, + "sampling/sampling_logp_difference/mean": 0.015761353075504303, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 198.90625, + "completions/mean_terminated_length": 198.90625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3429109752178192, + "epoch": 1.2647058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7782923039338083, + "kl": 0.14510124921798706, + "learning_rate": 7.185450688833083e-07, + "loss": 0.0195, + "num_tokens": 32555339.0, + "reward": 0.3125, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.5084730982780457, + "sampling/sampling_logp_difference/max": 0.8317980766296387, + "sampling/sampling_logp_difference/mean": 0.01605771854519844, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 173.734375, + "completions/mean_terminated_length": 173.734375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.38110750913619995, + "epoch": 1.2659313725490196, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3586280279621117, + "kl": 0.18957901000976562, + "learning_rate": 7.179041207394331e-07, + "loss": -0.0105, + "num_tokens": 32583706.0, + "reward": 0.5, + "reward_std": 0.6143567562103271, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.532178521156311, + "sampling/importance_sampling_ratio/mean": 1.0000102519989014, + "sampling/importance_sampling_ratio/min": 0.595439076423645, + "sampling/sampling_logp_difference/max": 0.518456220626831, + "sampling/sampling_logp_difference/mean": 0.016505541279911995, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 191.34375, + "completions/mean_terminated_length": 191.34375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.34220486879348755, + "epoch": 1.267156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05646084522535247, + "kl": 0.12253264337778091, + "learning_rate": 7.172627302639975e-07, + "loss": 0.0012, + "num_tokens": 32619664.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6237683296203613, + "sampling/importance_sampling_ratio/mean": 0.9988462924957275, + "sampling/importance_sampling_ratio/min": 0.37885966897010803, + "sampling/sampling_logp_difference/max": 0.9705893993377686, + "sampling/sampling_logp_difference/mean": 0.017415886744856834, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 292.4375, + "completions/mean_terminated_length": 292.4375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.366678923368454, + "epoch": 1.2683823529411764, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6286243291552, + "kl": 0.09592580795288086, + "learning_rate": 7.166208987589836e-07, + "loss": 0.0442, + "num_tokens": 32653788.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995908141136169, + "sampling/importance_sampling_ratio/min": 0.5976837277412415, + "sampling/sampling_logp_difference/max": 0.7184348106384277, + "sampling/sampling_logp_difference/mean": 0.016546351835131645, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 175.3125, + "completions/mean_terminated_length": 175.3125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2045753300189972, + "epoch": 1.2696078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0608310464490693, + "kl": 0.11045706272125244, + "learning_rate": 7.159786275272686e-07, + "loss": -0.0054, + "num_tokens": 32680064.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6443026065826416, + "sampling/importance_sampling_ratio/mean": 1.0000745058059692, + "sampling/importance_sampling_ratio/min": 0.6105195879936218, + "sampling/sampling_logp_difference/max": 0.4973163604736328, + "sampling/sampling_logp_difference/mean": 0.011742750182747841, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 178.84375, + "completions/mean_terminated_length": 178.84375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3212544918060303, + "epoch": 1.2708333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0584832092626283, + "kl": 0.14732259511947632, + "learning_rate": 7.153359178726221e-07, + "loss": 0.0016, + "num_tokens": 32706326.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.96951162815094, + "sampling/importance_sampling_ratio/mean": 0.9999465346336365, + "sampling/importance_sampling_ratio/min": 0.06851734220981598, + "sampling/sampling_logp_difference/max": 2.680668354034424, + "sampling/sampling_logp_difference/mean": 0.015915201976895332, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 222.84375, + "completions/mean_terminated_length": 222.84375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4380054175853729, + "epoch": 1.2720588235294117, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.928208629225113, + "kl": 0.18012815713882446, + "learning_rate": 7.146927710997046e-07, + "loss": -0.0014, + "num_tokens": 32737740.0, + "reward": 0.6875, + "reward_std": 0.5915650129318237, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4524939060211182, + "sampling/importance_sampling_ratio/mean": 0.9998493790626526, + "sampling/importance_sampling_ratio/min": 0.6089973449707031, + "sampling/sampling_logp_difference/max": 0.4959414005279541, + "sampling/sampling_logp_difference/mean": 0.017109330743551254, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 233.828125, + "completions/mean_terminated_length": 233.828125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3463178873062134, + "epoch": 1.2732843137254901, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7030425356642425, + "kl": 0.13462179899215698, + "learning_rate": 7.140491885140628e-07, + "loss": 0.0272, + "num_tokens": 32768657.0, + "reward": 0.0, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.917688250541687, + "sampling/importance_sampling_ratio/mean": 1.000036597251892, + "sampling/importance_sampling_ratio/min": 0.6179336309432983, + "sampling/sampling_logp_difference/max": 0.6511204242706299, + "sampling/sampling_logp_difference/mean": 0.015579458326101303, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.34624820947647095, + "epoch": 1.2745098039215685, + "frac_reward_zero_std": 0.25, + "grad_norm": 10.609165028958328, + "kl": 0.14227654039859772, + "learning_rate": 7.134051714221286e-07, + "loss": 0.0249, + "num_tokens": 32805935.0, + "reward": 0.5, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7949460744857788, + "sampling/importance_sampling_ratio/mean": 0.9997808933258057, + "sampling/importance_sampling_ratio/min": 0.38582944869995117, + "sampling/sampling_logp_difference/max": 0.9523599147796631, + "sampling/sampling_logp_difference/mean": 0.01671173796057701, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 186.71875, + "completions/mean_terminated_length": 186.71875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.22002194821834564, + "epoch": 1.2757352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12063033022994081, + "kl": 0.0766913890838623, + "learning_rate": 7.127607211312162e-07, + "loss": 0.0008, + "num_tokens": 32832333.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.498414158821106, + "sampling/importance_sampling_ratio/mean": 0.9999425411224365, + "sampling/importance_sampling_ratio/min": 0.3760538101196289, + "sampling/sampling_logp_difference/max": 0.9780230522155762, + "sampling/sampling_logp_difference/mean": 0.012133192270994186, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 264.484375, + "completions/mean_terminated_length": 264.484375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.2804335057735443, + "epoch": 1.2769607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0508564541259473, + "kl": 0.09116211533546448, + "learning_rate": 7.121158389495185e-07, + "loss": 0.0049, + "num_tokens": 32865452.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003582239151, + "sampling/importance_sampling_ratio/min": 0.34089013934135437, + "sampling/sampling_logp_difference/max": 1.2095816135406494, + "sampling/sampling_logp_difference/mean": 0.01316175889223814, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 285.703125, + "completions/mean_terminated_length": 285.703125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3882972300052643, + "epoch": 1.278186274509804, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6799637917123296, + "kl": 0.12771184742450714, + "learning_rate": 7.114705261861061e-07, + "loss": 0.0512, + "num_tokens": 32908553.0, + "reward": 0.125, + "reward_std": 0.644389271736145, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995465278625488, + "sampling/importance_sampling_ratio/min": 0.4101741909980774, + "sampling/sampling_logp_difference/max": 0.8911733627319336, + "sampling/sampling_logp_difference/mean": 0.017560133710503578, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 243.359375, + "completions/mean_terminated_length": 243.359375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.26590538024902344, + "epoch": 1.2794117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4105436688831778, + "kl": 0.14571696519851685, + "learning_rate": 7.108247841509222e-07, + "loss": 0.0174, + "num_tokens": 32936736.0, + "reward": 0.34375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998252391815186, + "sampling/importance_sampling_ratio/min": 0.4056454002857208, + "sampling/sampling_logp_difference/max": 0.9022759199142456, + "sampling/sampling_logp_difference/mean": 0.013069100677967072, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 186.046875, + "completions/mean_terminated_length": 186.046875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.30763909220695496, + "epoch": 1.280637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.081812013135514, + "kl": 0.2121243178844452, + "learning_rate": 7.101786141547828e-07, + "loss": 0.0059, + "num_tokens": 32964211.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6281697750091553, + "sampling/importance_sampling_ratio/mean": 1.0008862018585205, + "sampling/importance_sampling_ratio/min": 0.62428218126297, + "sampling/sampling_logp_difference/max": 0.4874565601348877, + "sampling/sampling_logp_difference/mean": 0.015111139044165611, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 233.28125, + "completions/mean_terminated_length": 233.28125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.30027326941490173, + "epoch": 1.281862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.806171699479573, + "kl": 0.0992993712425232, + "learning_rate": 7.095320175093718e-07, + "loss": 0.0359, + "num_tokens": 32994757.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5194125175476074, + "sampling/importance_sampling_ratio/mean": 1.000177025794983, + "sampling/importance_sampling_ratio/min": 0.6330479383468628, + "sampling/sampling_logp_difference/max": 0.45720911026000977, + "sampling/sampling_logp_difference/mean": 0.013746829703450203, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 188.65625, + "completions/mean_terminated_length": 188.65625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.21341879665851593, + "epoch": 1.2830882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1331586318601023, + "kl": 0.14237648248672485, + "learning_rate": 7.088849955272396e-07, + "loss": 0.0338, + "num_tokens": 33022623.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995142221450806, + "sampling/importance_sampling_ratio/min": 0.6229760050773621, + "sampling/sampling_logp_difference/max": 1.130134105682373, + "sampling/sampling_logp_difference/mean": 0.011872018687427044, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 187.078125, + "completions/mean_terminated_length": 187.078125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.361431747674942, + "epoch": 1.284313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6351310888941764, + "kl": 0.1553775668144226, + "learning_rate": 7.082375495217995e-07, + "loss": 0.0002, + "num_tokens": 33050388.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994962215423584, + "sampling/importance_sampling_ratio/min": 0.06389383971691132, + "sampling/sampling_logp_difference/max": 2.750532388687134, + "sampling/sampling_logp_difference/mean": 0.01660950854420662, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 175.484375, + "completions/mean_terminated_length": 175.484375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2529153823852539, + "epoch": 1.2855392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05582279030978907, + "kl": 0.10453030467033386, + "learning_rate": 7.075896808073263e-07, + "loss": 0.001, + "num_tokens": 33079779.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.861152172088623, + "sampling/importance_sampling_ratio/mean": 1.001260757446289, + "sampling/importance_sampling_ratio/min": 0.51512211561203, + "sampling/sampling_logp_difference/max": 0.66335129737854, + "sampling/sampling_logp_difference/mean": 0.013509858399629593, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 233.5, + "completions/mean_terminated_length": 233.5, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.2999563217163086, + "epoch": 1.2867647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4515053218908125, + "kl": 0.12369360029697418, + "learning_rate": 7.069413906989523e-07, + "loss": 0.0016, + "num_tokens": 33112915.0, + "reward": 0.3125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5768550634384155, + "sampling/importance_sampling_ratio/mean": 0.9995861053466797, + "sampling/importance_sampling_ratio/min": 0.6017664074897766, + "sampling/sampling_logp_difference/max": 0.5078859329223633, + "sampling/sampling_logp_difference/mean": 0.013802142813801765, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 222.390625, + "completions/mean_terminated_length": 222.390625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3194698095321655, + "epoch": 1.2879901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1232975024968728, + "kl": 0.18103843927383423, + "learning_rate": 7.062926805126652e-07, + "loss": 0.0108, + "num_tokens": 33145356.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006964206695557, + "sampling/importance_sampling_ratio/min": 0.48957717418670654, + "sampling/sampling_logp_difference/max": 0.9625020027160645, + "sampling/sampling_logp_difference/mean": 0.016171928495168686, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 240.921875, + "completions/mean_terminated_length": 240.921875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.30571770668029785, + "epoch": 1.2892156862745099, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0322747904837823, + "kl": 0.09930259734392166, + "learning_rate": 7.056435515653058e-07, + "loss": 0.0178, + "num_tokens": 33177431.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5560518503189087, + "sampling/importance_sampling_ratio/mean": 0.9996767044067383, + "sampling/importance_sampling_ratio/min": 0.502656102180481, + "sampling/sampling_logp_difference/max": 0.6878490447998047, + "sampling/sampling_logp_difference/mean": 0.01476350612938404, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 204.0, + "completions/mean_terminated_length": 204.0, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.339089035987854, + "epoch": 1.2904411764705883, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.093188298417368, + "kl": 0.16841793060302734, + "learning_rate": 7.049940051745646e-07, + "loss": 0.023, + "num_tokens": 33206151.0, + "reward": 0.5625, + "reward_std": 0.6663130521774292, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7607289552688599, + "sampling/importance_sampling_ratio/mean": 0.999548614025116, + "sampling/importance_sampling_ratio/min": 0.5105680227279663, + "sampling/sampling_logp_difference/max": 0.6722314357757568, + "sampling/sampling_logp_difference/mean": 0.0169211458414793, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 230.078125, + "completions/mean_terminated_length": 230.078125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.37655603885650635, + "epoch": 1.2916666666666667, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5857500327224365, + "kl": 0.15081095695495605, + "learning_rate": 7.043440426589795e-07, + "loss": -0.0454, + "num_tokens": 33243196.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8727679252624512, + "sampling/importance_sampling_ratio/mean": 0.9996587634086609, + "sampling/importance_sampling_ratio/min": 0.5912025570869446, + "sampling/sampling_logp_difference/max": 0.6274175643920898, + "sampling/sampling_logp_difference/mean": 0.01732090674340725, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 199.984375, + "completions/mean_terminated_length": 199.984375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.22158865630626678, + "epoch": 1.2928921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05580966844163945, + "kl": 0.12035124748945236, + "learning_rate": 7.036936653379335e-07, + "loss": 0.0011, + "num_tokens": 33274587.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999060034751892, + "sampling/importance_sampling_ratio/min": 0.5996983647346497, + "sampling/sampling_logp_difference/max": 0.901757001876831, + "sampling/sampling_logp_difference/mean": 0.013135458342730999, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 330.59375, + "completions/mean_terminated_length": 330.59375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.39374738931655884, + "epoch": 1.2941176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7205187284383933, + "kl": 0.11873811483383179, + "learning_rate": 7.030428745316512e-07, + "loss": -0.0245, + "num_tokens": 33318177.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.62209153175354, + "sampling/importance_sampling_ratio/mean": 0.9998387098312378, + "sampling/importance_sampling_ratio/min": 0.43553099036216736, + "sampling/sampling_logp_difference/max": 0.8311893939971924, + "sampling/sampling_logp_difference/mean": 0.016345981508493423, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 202.0625, + "completions/mean_terminated_length": 202.0625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.31683558225631714, + "epoch": 1.295343137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5122700222221295, + "kl": 0.13294917345046997, + "learning_rate": 7.023916715611968e-07, + "loss": 0.0392, + "num_tokens": 33351877.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4794986248016357, + "sampling/importance_sampling_ratio/mean": 0.9991467595100403, + "sampling/importance_sampling_ratio/min": 0.6412928104400635, + "sampling/sampling_logp_difference/max": 0.44426918029785156, + "sampling/sampling_logp_difference/mean": 0.015316734090447426, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 224.21875, + "completions/mean_terminated_length": 224.21875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.35916852951049805, + "epoch": 1.2965686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8416802163181198, + "kl": 0.13629162311553955, + "learning_rate": 7.017400577484712e-07, + "loss": -0.0102, + "num_tokens": 33382803.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8823031187057495, + "sampling/importance_sampling_ratio/mean": 0.99980229139328, + "sampling/importance_sampling_ratio/min": 0.637050986289978, + "sampling/sampling_logp_difference/max": 0.6324961185455322, + "sampling/sampling_logp_difference/mean": 0.01551731489598751, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 191.046875, + "completions/mean_terminated_length": 191.046875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2954980731010437, + "epoch": 1.2977941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052997823499318364, + "kl": 0.13235683739185333, + "learning_rate": 7.010880344162086e-07, + "loss": 0.0013, + "num_tokens": 33413926.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7516822814941406, + "sampling/importance_sampling_ratio/mean": 0.9992790222167969, + "sampling/importance_sampling_ratio/min": 0.5718526840209961, + "sampling/sampling_logp_difference/max": 0.5605766773223877, + "sampling/sampling_logp_difference/mean": 0.01648583449423313, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 275.078125, + "completions/mean_terminated_length": 275.078125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2996733784675598, + "epoch": 1.2990196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.298984373440655, + "kl": 0.09220343828201294, + "learning_rate": 7.004356028879758e-07, + "loss": 0.0233, + "num_tokens": 33451787.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6204760074615479, + "sampling/importance_sampling_ratio/mean": 0.9997023344039917, + "sampling/importance_sampling_ratio/min": 0.629865288734436, + "sampling/sampling_logp_difference/max": 0.48271989822387695, + "sampling/sampling_logp_difference/mean": 0.01601042039692402, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 205.625, + "completions/mean_terminated_length": 205.625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.24982041120529175, + "epoch": 1.3002450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24166632185575956, + "kl": 0.1469212919473648, + "learning_rate": 6.99782764488167e-07, + "loss": 0.0016, + "num_tokens": 33483859.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8205466270446777, + "sampling/importance_sampling_ratio/mean": 0.9996812343597412, + "sampling/importance_sampling_ratio/min": 0.6778234839439392, + "sampling/sampling_logp_difference/max": 0.5991368293762207, + "sampling/sampling_logp_difference/mean": 0.011886507272720337, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 246.671875, + "completions/mean_terminated_length": 246.671875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2619977593421936, + "epoch": 1.3014705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9328385514279917, + "kl": 0.10800004750490189, + "learning_rate": 6.991295205420027e-07, + "loss": 0.0032, + "num_tokens": 33517422.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.574527382850647, + "sampling/importance_sampling_ratio/mean": 1.000114917755127, + "sampling/importance_sampling_ratio/min": 0.5232529640197754, + "sampling/sampling_logp_difference/max": 0.6476902961730957, + "sampling/sampling_logp_difference/mean": 0.013545207679271698, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 257.109375, + "completions/mean_terminated_length": 257.109375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3410852551460266, + "epoch": 1.3026960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047787409747118745, + "kl": 0.10063477605581284, + "learning_rate": 6.984758723755272e-07, + "loss": 0.001, + "num_tokens": 33553045.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6462818384170532, + "sampling/importance_sampling_ratio/mean": 1.0003209114074707, + "sampling/importance_sampling_ratio/min": 0.415952205657959, + "sampling/sampling_logp_difference/max": 0.8771849274635315, + "sampling/sampling_logp_difference/mean": 0.015459725633263588, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 208.21875, + "completions/mean_terminated_length": 208.21875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.41836845874786377, + "epoch": 1.303921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.852438571334581, + "kl": 0.15046635270118713, + "learning_rate": 6.978218213156044e-07, + "loss": -0.0373, + "num_tokens": 33581507.0, + "reward": 0.4375, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6007379293441772, + "sampling/importance_sampling_ratio/mean": 1.000087022781372, + "sampling/importance_sampling_ratio/min": 0.6114194989204407, + "sampling/sampling_logp_difference/max": 0.4919719696044922, + "sampling/sampling_logp_difference/mean": 0.01780632510781288, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 216.1875, + "completions/mean_terminated_length": 216.1875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.27934664487838745, + "epoch": 1.3051470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3666395790917611, + "kl": 0.125428706407547, + "learning_rate": 6.971673686899169e-07, + "loss": 0.0093, + "num_tokens": 33614175.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5971810817718506, + "sampling/importance_sampling_ratio/mean": 0.9997473359107971, + "sampling/importance_sampling_ratio/min": 0.5324405431747437, + "sampling/sampling_logp_difference/max": 0.6302840709686279, + "sampling/sampling_logp_difference/mean": 0.016285786405205727, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 190.0625, + "completions/mean_terminated_length": 190.0625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3244784474372864, + "epoch": 1.3063725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1379653564633323, + "kl": 0.2047014981508255, + "learning_rate": 6.965125158269618e-07, + "loss": -0.0433, + "num_tokens": 33644195.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6598302125930786, + "sampling/importance_sampling_ratio/mean": 0.9999638199806213, + "sampling/importance_sampling_ratio/min": 0.5362949371337891, + "sampling/sampling_logp_difference/max": 0.6230709552764893, + "sampling/sampling_logp_difference/mean": 0.01563986763358116, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 191.9375, + "completions/mean_terminated_length": 191.9375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.35488128662109375, + "epoch": 1.3075980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5621760628019599, + "kl": 0.14954595267772675, + "learning_rate": 6.958572640560491e-07, + "loss": 0.0138, + "num_tokens": 33679647.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5813579559326172, + "sampling/importance_sampling_ratio/mean": 1.0000293254852295, + "sampling/importance_sampling_ratio/min": 0.6106756329536438, + "sampling/sampling_logp_difference/max": 0.49318933486938477, + "sampling/sampling_logp_difference/mean": 0.01592111587524414, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 179.40625, + "completions/mean_terminated_length": 179.40625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.20306000113487244, + "epoch": 1.3088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3036170426143063, + "kl": 0.09833575785160065, + "learning_rate": 6.952016147072981e-07, + "loss": 0.0, + "num_tokens": 33706425.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.9271328449249268, + "sampling/importance_sampling_ratio/mean": 0.9996373653411865, + "sampling/importance_sampling_ratio/min": 0.3300285041332245, + "sampling/sampling_logp_difference/max": 1.1085762977600098, + "sampling/sampling_logp_difference/mean": 0.01394584123045206, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 243.359375, + "completions/mean_terminated_length": 243.359375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.405498743057251, + "epoch": 1.3100490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2919334017094637, + "kl": 0.1486983299255371, + "learning_rate": 6.945455691116358e-07, + "loss": 0.01, + "num_tokens": 33740384.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6165982484817505, + "sampling/importance_sampling_ratio/mean": 1.000389814376831, + "sampling/importance_sampling_ratio/min": 0.6166492104530334, + "sampling/sampling_logp_difference/max": 0.48345494270324707, + "sampling/sampling_logp_difference/mean": 0.017727406695485115, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 179.296875, + "completions/mean_terminated_length": 179.296875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2689400613307953, + "epoch": 1.3112745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09351448609747967, + "kl": 0.11540800333023071, + "learning_rate": 6.938891286007928e-07, + "loss": 0.0012, + "num_tokens": 33775987.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5074286460876465, + "sampling/importance_sampling_ratio/mean": 0.9998900890350342, + "sampling/importance_sampling_ratio/min": 0.2538463771343231, + "sampling/sampling_logp_difference/max": 1.3710259199142456, + "sampling/sampling_logp_difference/mean": 0.014276232570409775, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 175.921875, + "completions/mean_terminated_length": 175.921875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2785714268684387, + "epoch": 1.3125, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0695506021075314, + "kl": 0.09059837460517883, + "learning_rate": 6.932322945073023e-07, + "loss": -0.0602, + "num_tokens": 33801374.0, + "reward": -0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6554646492004395, + "sampling/importance_sampling_ratio/mean": 1.0000858306884766, + "sampling/importance_sampling_ratio/min": 0.6298375129699707, + "sampling/sampling_logp_difference/max": 0.5040817260742188, + "sampling/sampling_logp_difference/mean": 0.015221224166452885, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 255.65625, + "completions/mean_terminated_length": 255.65625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.32079508900642395, + "epoch": 1.3137254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6987967605391736, + "kl": 0.12066194415092468, + "learning_rate": 6.925750681644953e-07, + "loss": -0.1204, + "num_tokens": 33833352.0, + "reward": 0.53125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995695948600769, + "sampling/importance_sampling_ratio/min": 0.49790158867836, + "sampling/sampling_logp_difference/max": 0.7732429504394531, + "sampling/sampling_logp_difference/mean": 0.014812313951551914, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 217.5, + "completions/mean_terminated_length": 217.5, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.1903567910194397, + "epoch": 1.3149509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04255177341041909, + "kl": 0.08062325417995453, + "learning_rate": 6.919174509065003e-07, + "loss": 0.0007, + "num_tokens": 33874184.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999676942825317, + "sampling/importance_sampling_ratio/min": 0.5171644687652588, + "sampling/sampling_logp_difference/max": 0.8792343139648438, + "sampling/sampling_logp_difference/mean": 0.012009510770440102, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 257.78125, + "completions/mean_terminated_length": 257.78125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3101387023925781, + "epoch": 1.3161764705882353, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7028719699463302, + "kl": 0.1298815906047821, + "learning_rate": 6.91259444068238e-07, + "loss": -0.0294, + "num_tokens": 33908906.0, + "reward": 0.8125, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.9180221557617188, + "sampling/importance_sampling_ratio/mean": 0.9998555779457092, + "sampling/importance_sampling_ratio/min": 0.49833470582962036, + "sampling/sampling_logp_difference/max": 0.6964833736419678, + "sampling/sampling_logp_difference/mean": 0.0152537040412426, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 309.734375, + "completions/mean_terminated_length": 309.734375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.42819422483444214, + "epoch": 1.3174019607843137, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1147177454245716, + "kl": 0.14049476385116577, + "learning_rate": 6.906010489854209e-07, + "loss": 0.0592, + "num_tokens": 33950809.0, + "reward": 0.21875, + "reward_std": 0.8013203144073486, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6463440656661987, + "sampling/importance_sampling_ratio/mean": 0.9998204708099365, + "sampling/importance_sampling_ratio/min": 0.4735189378261566, + "sampling/sampling_logp_difference/max": 0.747563362121582, + "sampling/sampling_logp_difference/mean": 0.019438711926341057, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3980330526828766, + "epoch": 1.3186274509803921, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8301895289917482, + "kl": 0.14308233559131622, + "learning_rate": 6.899422669945493e-07, + "loss": -0.0192, + "num_tokens": 33985737.0, + "reward": 0.03125, + "reward_std": 0.6970869898796082, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6340501308441162, + "sampling/importance_sampling_ratio/mean": 1.0005192756652832, + "sampling/importance_sampling_ratio/min": 0.4092217683792114, + "sampling/sampling_logp_difference/max": 0.8934980630874634, + "sampling/sampling_logp_difference/mean": 0.01768249273300171, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 177.734375, + "completions/mean_terminated_length": 177.734375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.21226820349693298, + "epoch": 1.3198529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.614710418153234, + "kl": 0.10652191936969757, + "learning_rate": 6.892830994329088e-07, + "loss": -0.0369, + "num_tokens": 34017496.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002856254577637, + "sampling/importance_sampling_ratio/min": 0.5585871934890747, + "sampling/sampling_logp_difference/max": 0.9647364616394043, + "sampling/sampling_logp_difference/mean": 0.012589693069458008, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 285.765625, + "completions/mean_terminated_length": 285.765625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.4327370524406433, + "epoch": 1.321078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3803107836661115, + "kl": 0.15147444605827332, + "learning_rate": 6.886235476385681e-07, + "loss": -0.0133, + "num_tokens": 34053721.0, + "reward": 0.15625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5612167119979858, + "sampling/importance_sampling_ratio/mean": 1.0000461339950562, + "sampling/importance_sampling_ratio/min": 0.4624379277229309, + "sampling/sampling_logp_difference/max": 0.7712429165840149, + "sampling/sampling_logp_difference/mean": 0.018006278201937675, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 182.109375, + "completions/mean_terminated_length": 182.109375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2860751152038574, + "epoch": 1.3223039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07197268825494085, + "kl": 0.15290674567222595, + "learning_rate": 6.879636129503751e-07, + "loss": 0.0014, + "num_tokens": 34085664.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6207294464111328, + "sampling/importance_sampling_ratio/mean": 1.000579595565796, + "sampling/importance_sampling_ratio/min": 0.577124297618866, + "sampling/sampling_logp_difference/max": 0.5496976375579834, + "sampling/sampling_logp_difference/mean": 0.015663743019104004, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 221.53125, + "completions/mean_terminated_length": 221.53125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.28195878863334656, + "epoch": 1.3235294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.605452125919545, + "kl": 0.08582913130521774, + "learning_rate": 6.87303296707956e-07, + "loss": 0.0488, + "num_tokens": 34121666.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995183944702148, + "sampling/importance_sampling_ratio/min": 0.11842865496873856, + "sampling/sampling_logp_difference/max": 2.1334445476531982, + "sampling/sampling_logp_difference/mean": 0.015778496861457825, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 222.203125, + "completions/mean_terminated_length": 222.203125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.32386523485183716, + "epoch": 1.3247549019607843, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.935591694995017, + "kl": 0.10367701202630997, + "learning_rate": 6.866426002517105e-07, + "loss": -0.0449, + "num_tokens": 34149423.0, + "reward": 0.21875, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6926884651184082, + "sampling/importance_sampling_ratio/mean": 1.000137209892273, + "sampling/importance_sampling_ratio/min": 0.5572013854980469, + "sampling/sampling_logp_difference/max": 0.5848284959793091, + "sampling/sampling_logp_difference/mean": 0.014832671731710434, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 174.21875, + "completions/mean_terminated_length": 174.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2482524961233139, + "epoch": 1.3259803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9509560925915717, + "kl": 0.10061395913362503, + "learning_rate": 6.859815249228105e-07, + "loss": 0.0079, + "num_tokens": 34177229.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.576332926750183, + "sampling/importance_sampling_ratio/mean": 0.9995920658111572, + "sampling/importance_sampling_ratio/min": 0.5588234663009644, + "sampling/sampling_logp_difference/max": 0.5819215774536133, + "sampling/sampling_logp_difference/mean": 0.01229644101113081, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 187.046875, + "completions/mean_terminated_length": 187.046875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.31346315145492554, + "epoch": 1.3272058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0175494789187907, + "kl": 0.11647707223892212, + "learning_rate": 6.853200720631972e-07, + "loss": 0.0108, + "num_tokens": 34204768.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6359094381332397, + "sampling/importance_sampling_ratio/mean": 1.0006194114685059, + "sampling/importance_sampling_ratio/min": 0.5764164924621582, + "sampling/sampling_logp_difference/max": 0.5509247779846191, + "sampling/sampling_logp_difference/mean": 0.016130639240145683, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 225.890625, + "completions/mean_terminated_length": 225.890625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.33300894498825073, + "epoch": 1.3284313725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7513658062777013, + "kl": 0.11280956864356995, + "learning_rate": 6.846582430155781e-07, + "loss": -0.0062, + "num_tokens": 34234137.0, + "reward": 0.65625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5556352138519287, + "sampling/importance_sampling_ratio/mean": 0.9996647834777832, + "sampling/importance_sampling_ratio/min": 0.5057145953178406, + "sampling/sampling_logp_difference/max": 0.6817827224731445, + "sampling/sampling_logp_difference/mean": 0.016297079622745514, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 172.796875, + "completions/mean_terminated_length": 172.796875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2712753713130951, + "epoch": 1.329656862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7740138265401932, + "kl": 0.12398400157690048, + "learning_rate": 6.839960391234242e-07, + "loss": 0.0476, + "num_tokens": 34258172.0, + "reward": 0.375, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003924369812012, + "sampling/importance_sampling_ratio/min": 0.432391881942749, + "sampling/sampling_logp_difference/max": 0.8384230136871338, + "sampling/sampling_logp_difference/mean": 0.01533106155693531, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 223.65625, + "completions/mean_terminated_length": 223.65625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.28661102056503296, + "epoch": 1.3308823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1567830622447894, + "kl": 0.10410013794898987, + "learning_rate": 6.833334617309672e-07, + "loss": 0.045, + "num_tokens": 34292022.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5311121940612793, + "sampling/importance_sampling_ratio/mean": 1.0001766681671143, + "sampling/importance_sampling_ratio/min": 0.5280182957649231, + "sampling/sampling_logp_difference/max": 0.6386243104934692, + "sampling/sampling_logp_difference/mean": 0.014018706977367401, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 241.109375, + "completions/mean_terminated_length": 241.109375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2740551829338074, + "epoch": 1.3321078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0361179328305387, + "kl": 0.09827157109975815, + "learning_rate": 6.826705121831976e-07, + "loss": 0.0009, + "num_tokens": 34323677.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5972678661346436, + "sampling/importance_sampling_ratio/mean": 0.999886155128479, + "sampling/importance_sampling_ratio/min": 0.643671989440918, + "sampling/sampling_logp_difference/max": 0.468294620513916, + "sampling/sampling_logp_difference/mean": 0.013634300790727139, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 292.15625, + "completions/mean_terminated_length": 292.15625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.33552277088165283, + "epoch": 1.3333333333333333, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4465584966332885, + "kl": 0.09587956964969635, + "learning_rate": 6.820071918258605e-07, + "loss": 0.0061, + "num_tokens": 34361143.0, + "reward": 0.28125, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.921859860420227, + "sampling/importance_sampling_ratio/mean": 1.0002851486206055, + "sampling/importance_sampling_ratio/min": 0.06600768119096756, + "sampling/sampling_logp_difference/max": 2.717984199523926, + "sampling/sampling_logp_difference/mean": 0.0153445303440094, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 178.234375, + "completions/mean_terminated_length": 178.234375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24344758689403534, + "epoch": 1.3345588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043404515045103384, + "kl": 0.09411592781543732, + "learning_rate": 6.813435020054548e-07, + "loss": 0.0009, + "num_tokens": 34387958.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000336170196533, + "sampling/importance_sampling_ratio/min": 0.610383152961731, + "sampling/sampling_logp_difference/max": 0.8105227947235107, + "sampling/sampling_logp_difference/mean": 0.013158331625163555, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 263.703125, + "completions/mean_terminated_length": 263.703125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.31929922103881836, + "epoch": 1.3357843137254901, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.258805920791015, + "kl": 0.09902656078338623, + "learning_rate": 6.806794440692282e-07, + "loss": -0.0302, + "num_tokens": 34422083.0, + "reward": -0.0625, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.636869192123413, + "sampling/importance_sampling_ratio/mean": 1.0007058382034302, + "sampling/importance_sampling_ratio/min": 0.37227267026901245, + "sampling/sampling_logp_difference/max": 0.9881287813186646, + "sampling/sampling_logp_difference/mean": 0.015133077278733253, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 223.515625, + "completions/mean_terminated_length": 223.515625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2561953067779541, + "epoch": 1.3370098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3665153423660428, + "kl": 0.07908254861831665, + "learning_rate": 6.800150193651767e-07, + "loss": 0.0024, + "num_tokens": 34453124.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.477170467376709, + "sampling/importance_sampling_ratio/mean": 0.999785304069519, + "sampling/importance_sampling_ratio/min": 0.6561281681060791, + "sampling/sampling_logp_difference/max": 0.4213991165161133, + "sampling/sampling_logp_difference/mean": 0.013078063726425171, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 199.59375, + "completions/mean_terminated_length": 199.59375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2496260553598404, + "epoch": 1.3382352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.374085590077728, + "kl": 0.08954588323831558, + "learning_rate": 6.793502292420401e-07, + "loss": -0.008, + "num_tokens": 34481610.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6981291770935059, + "sampling/importance_sampling_ratio/mean": 1.0000662803649902, + "sampling/importance_sampling_ratio/min": 0.6202826499938965, + "sampling/sampling_logp_difference/max": 0.5295271873474121, + "sampling/sampling_logp_difference/mean": 0.013239433988928795, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 250.21875, + "completions/mean_terminated_length": 250.21875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.36997076869010925, + "epoch": 1.3394607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07126523930538059, + "kl": 0.1038190945982933, + "learning_rate": 6.786850750493005e-07, + "loss": 0.001, + "num_tokens": 34515576.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7066212892532349, + "sampling/importance_sampling_ratio/mean": 1.0008347034454346, + "sampling/importance_sampling_ratio/min": 0.6208910346031189, + "sampling/sampling_logp_difference/max": 0.5345156192779541, + "sampling/sampling_logp_difference/mean": 0.01807655580341816, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 160.84375, + "completions/mean_terminated_length": 160.84375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.22015033662319183, + "epoch": 1.340686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046881469980766685, + "kl": 0.07629378885030746, + "learning_rate": 6.780195581371784e-07, + "loss": 0.0008, + "num_tokens": 34539902.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6254730224609375, + "sampling/importance_sampling_ratio/mean": 1.0000666379928589, + "sampling/importance_sampling_ratio/min": 0.6097134351730347, + "sampling/sampling_logp_difference/max": 0.4947662353515625, + "sampling/sampling_logp_difference/mean": 0.013527627103030682, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 220.5, + "completions/mean_terminated_length": 220.5, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.260101318359375, + "epoch": 1.3419117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05144603117378995, + "kl": 0.08464940637350082, + "learning_rate": 6.773536798566313e-07, + "loss": 0.0008, + "num_tokens": 34571550.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002096891403198, + "sampling/importance_sampling_ratio/min": 0.5105252861976624, + "sampling/sampling_logp_difference/max": 0.8242547512054443, + "sampling/sampling_logp_difference/mean": 0.013929269276559353, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 264.0, + "completions/mean_terminated_length": 264.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.19407080113887787, + "epoch": 1.343137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7789398781421727, + "kl": 0.060388315469026566, + "learning_rate": 6.766874415593495e-07, + "loss": -0.0081, + "num_tokens": 34605774.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996485710144043, + "sampling/importance_sampling_ratio/min": 0.44851693511009216, + "sampling/sampling_logp_difference/max": 0.8018088340759277, + "sampling/sampling_logp_difference/mean": 0.011173035018146038, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 212.875, + "completions/mean_terminated_length": 212.875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2143286168575287, + "epoch": 1.344362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05273733580224862, + "kl": 0.08921480178833008, + "learning_rate": 6.760208445977549e-07, + "loss": 0.0008, + "num_tokens": 34634022.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6457408666610718, + "sampling/importance_sampling_ratio/mean": 1.000124216079712, + "sampling/importance_sampling_ratio/min": 0.6015896797180176, + "sampling/sampling_logp_difference/max": 0.5081796646118164, + "sampling/sampling_logp_difference/mean": 0.01239019725471735, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 273.8125, + "completions/mean_terminated_length": 273.8125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.30872780084609985, + "epoch": 1.3455882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06237159816268388, + "kl": 0.08186507225036621, + "learning_rate": 6.753538903249974e-07, + "loss": 0.0007, + "num_tokens": 34677674.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003366470336914, + "sampling/importance_sampling_ratio/min": 0.4494738280773163, + "sampling/sampling_logp_difference/max": 0.8946397304534912, + "sampling/sampling_logp_difference/mean": 0.01739206723868847, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 254.3125, + "completions/mean_terminated_length": 254.3125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.31094664335250854, + "epoch": 1.346813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2818491418402742, + "kl": 0.10428554564714432, + "learning_rate": 6.74686580094951e-07, + "loss": 0.01, + "num_tokens": 34711902.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001916885375977, + "sampling/importance_sampling_ratio/min": 0.19906094670295715, + "sampling/sampling_logp_difference/max": 1.614144206047058, + "sampling/sampling_logp_difference/mean": 0.016226403415203094, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 229.203125, + "completions/mean_terminated_length": 229.203125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.38202738761901855, + "epoch": 1.3480392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6089337264588108, + "kl": 0.16241875290870667, + "learning_rate": 6.740189152622142e-07, + "loss": -0.0471, + "num_tokens": 34746091.0, + "reward": -0.125, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995103478431702, + "sampling/importance_sampling_ratio/min": 0.040602829307317734, + "sampling/sampling_logp_difference/max": 3.2039175033569336, + "sampling/sampling_logp_difference/mean": 0.01960323192179203, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 145.5, + "completions/mean_terminated_length": 145.5, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.1822962462902069, + "epoch": 1.3492647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08516365588250555, + "kl": 0.07012251019477844, + "learning_rate": 6.733508971821036e-07, + "loss": 0.0007, + "num_tokens": 34770747.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991965293884277, + "sampling/importance_sampling_ratio/min": 0.45569705963134766, + "sampling/sampling_logp_difference/max": 0.7859270572662354, + "sampling/sampling_logp_difference/mean": 0.013009462505578995, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 271.703125, + "completions/mean_terminated_length": 271.703125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3267415761947632, + "epoch": 1.3504901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2458060734568923, + "kl": 0.08820392191410065, + "learning_rate": 6.726825272106538e-07, + "loss": 0.0515, + "num_tokens": 34806344.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918341636658, + "sampling/importance_sampling_ratio/min": 0.4708138108253479, + "sampling/sampling_logp_difference/max": 0.7646852731704712, + "sampling/sampling_logp_difference/mean": 0.01696830615401268, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 225.8125, + "completions/mean_terminated_length": 225.8125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4195988178253174, + "epoch": 1.3517156862745099, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2420206724302587, + "kl": 0.2157381772994995, + "learning_rate": 6.720138067046134e-07, + "loss": 0.023, + "num_tokens": 34837500.0, + "reward": -0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.8745075464248657, + "sampling/importance_sampling_ratio/mean": 0.9999470114707947, + "sampling/importance_sampling_ratio/min": 0.5703800916671753, + "sampling/sampling_logp_difference/max": 0.6283459663391113, + "sampling/sampling_logp_difference/mean": 0.018711727112531662, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 172.265625, + "completions/mean_terminated_length": 172.265625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.20912107825279236, + "epoch": 1.3529411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05031629823568618, + "kl": 0.07606321573257446, + "learning_rate": 6.713447370214431e-07, + "loss": 0.0008, + "num_tokens": 34863357.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5024638175964355, + "sampling/importance_sampling_ratio/mean": 0.9996066689491272, + "sampling/importance_sampling_ratio/min": 0.6483970284461975, + "sampling/sampling_logp_difference/max": 0.43325209617614746, + "sampling/sampling_logp_difference/mean": 0.012016498483717442, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 171.796875, + "completions/mean_terminated_length": 171.796875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.26856446266174316, + "epoch": 1.3541666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4562655301043657, + "kl": 0.09392675757408142, + "learning_rate": 6.706753195193116e-07, + "loss": 0.001, + "num_tokens": 34890096.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.8024160861968994, + "sampling/importance_sampling_ratio/mean": 1.0000146627426147, + "sampling/importance_sampling_ratio/min": 0.6119200587272644, + "sampling/sampling_logp_difference/max": 0.5891280174255371, + "sampling/sampling_logp_difference/mean": 0.015046817250549793, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 190.640625, + "completions/mean_terminated_length": 190.640625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2494535744190216, + "epoch": 1.3553921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3153675097880058, + "kl": 0.12353288382291794, + "learning_rate": 6.700055555570941e-07, + "loss": -0.0027, + "num_tokens": 34919465.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6230236291885376, + "sampling/importance_sampling_ratio/mean": 1.000239372253418, + "sampling/importance_sampling_ratio/min": 0.513964056968689, + "sampling/sampling_logp_difference/max": 0.665601909160614, + "sampling/sampling_logp_difference/mean": 0.014158805832266808, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 182.578125, + "completions/mean_terminated_length": 182.578125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.20572876930236816, + "epoch": 1.3566176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.245647422311269, + "kl": 0.07359683513641357, + "learning_rate": 6.693354464943688e-07, + "loss": 0.075, + "num_tokens": 34946398.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5465470552444458, + "sampling/importance_sampling_ratio/mean": 0.9993281960487366, + "sampling/importance_sampling_ratio/min": 0.4414757788181305, + "sampling/sampling_logp_difference/max": 0.8176320791244507, + "sampling/sampling_logp_difference/mean": 0.012859884649515152, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 171.046875, + "completions/mean_terminated_length": 171.046875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2153121381998062, + "epoch": 1.357843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044084523082514315, + "kl": 0.08404094725847244, + "learning_rate": 6.68664993691415e-07, + "loss": 0.0008, + "num_tokens": 34979681.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4022732973098755, + "sampling/importance_sampling_ratio/mean": 1.000349760055542, + "sampling/importance_sampling_ratio/min": 0.6051716804504395, + "sampling/sampling_logp_difference/max": 0.5022430419921875, + "sampling/sampling_logp_difference/mean": 0.012641921639442444, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 177.046875, + "completions/mean_terminated_length": 177.046875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2505217492580414, + "epoch": 1.3590686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14886236711394396, + "kl": 0.11417225748300552, + "learning_rate": 6.679941985092092e-07, + "loss": 0.001, + "num_tokens": 35011028.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6209088563919067, + "sampling/importance_sampling_ratio/mean": 0.9995545744895935, + "sampling/importance_sampling_ratio/min": 0.2842315137386322, + "sampling/sampling_logp_difference/max": 1.257966160774231, + "sampling/sampling_logp_difference/mean": 0.01553020253777504, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 227.546875, + "completions/mean_terminated_length": 227.546875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.32394784688949585, + "epoch": 1.3602941176470589, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.899899238781701, + "kl": 0.13107380270957947, + "learning_rate": 6.673230623094231e-07, + "loss": 0.0165, + "num_tokens": 35044311.0, + "reward": 0.28125, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993151426315308, + "sampling/importance_sampling_ratio/min": 0.5468358397483826, + "sampling/sampling_logp_difference/max": 0.8583822250366211, + "sampling/sampling_logp_difference/mean": 0.016252432018518448, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 261.1875, + "completions/mean_terminated_length": 261.1875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.37364470958709717, + "epoch": 1.3615196078431373, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.72268187710298, + "kl": 0.1305997222661972, + "learning_rate": 6.666515864544208e-07, + "loss": -0.0002, + "num_tokens": 35078419.0, + "reward": 0.25, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.7393680810928345, + "sampling/importance_sampling_ratio/mean": 1.0007290840148926, + "sampling/importance_sampling_ratio/min": 0.5912945866584778, + "sampling/sampling_logp_difference/max": 0.5535218715667725, + "sampling/sampling_logp_difference/mean": 0.01725170388817787, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 172.234375, + "completions/mean_terminated_length": 172.234375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.29257479310035706, + "epoch": 1.3627450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0340952596276525, + "kl": 0.1277839094400406, + "learning_rate": 6.659797723072558e-07, + "loss": 0.0129, + "num_tokens": 35109554.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8624109029769897, + "sampling/importance_sampling_ratio/mean": 0.9998859167098999, + "sampling/importance_sampling_ratio/min": 0.4957435131072998, + "sampling/sampling_logp_difference/max": 0.7016966342926025, + "sampling/sampling_logp_difference/mean": 0.01669422537088394, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 244.0, + "completions/mean_terminated_length": 244.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2708520293235779, + "epoch": 1.3639705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.063436147267886, + "kl": 0.0937291830778122, + "learning_rate": 6.653076212316681e-07, + "loss": 0.0101, + "num_tokens": 35144738.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8026752471923828, + "sampling/importance_sampling_ratio/mean": 0.999823808670044, + "sampling/importance_sampling_ratio/min": 0.25222671031951904, + "sampling/sampling_logp_difference/max": 1.3774269819259644, + "sampling/sampling_logp_difference/mean": 0.014111516997218132, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 195.015625, + "completions/mean_terminated_length": 195.015625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.22389185428619385, + "epoch": 1.3651960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036642500568826744, + "kl": 0.07788003236055374, + "learning_rate": 6.646351345920818e-07, + "loss": 0.0008, + "num_tokens": 35175907.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7600457668304443, + "sampling/importance_sampling_ratio/mean": 0.999549388885498, + "sampling/importance_sampling_ratio/min": 0.6267169117927551, + "sampling/sampling_logp_difference/max": 0.5653398036956787, + "sampling/sampling_logp_difference/mean": 0.012162875384092331, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 338.09375, + "completions/mean_terminated_length": 338.09375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.28994637727737427, + "epoch": 1.366421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8466283656220367, + "kl": 0.07235518842935562, + "learning_rate": 6.639623137536022e-07, + "loss": -0.0171, + "num_tokens": 35213273.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006005764007568, + "sampling/importance_sampling_ratio/min": 0.5194491147994995, + "sampling/sampling_logp_difference/max": 0.8757719993591309, + "sampling/sampling_logp_difference/mean": 0.014356923289597034, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 239.265625, + "completions/mean_terminated_length": 239.265625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2334747612476349, + "epoch": 1.3676470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1310088388813992, + "kl": 0.08175957947969437, + "learning_rate": 6.63289160082013e-07, + "loss": -0.0167, + "num_tokens": 35243978.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6492300033569336, + "sampling/importance_sampling_ratio/mean": 1.0003918409347534, + "sampling/importance_sampling_ratio/min": 0.4857369661331177, + "sampling/sampling_logp_difference/max": 0.7220879793167114, + "sampling/sampling_logp_difference/mean": 0.01374032348394394, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 204.953125, + "completions/mean_terminated_length": 204.953125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3001249134540558, + "epoch": 1.3688725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08509560883550144, + "kl": 0.09763385355472565, + "learning_rate": 6.626156749437736e-07, + "loss": 0.001, + "num_tokens": 35274695.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9560965299606323, + "sampling/importance_sampling_ratio/mean": 1.0007350444793701, + "sampling/importance_sampling_ratio/min": 0.519588828086853, + "sampling/sampling_logp_difference/max": 0.6709508895874023, + "sampling/sampling_logp_difference/mean": 0.015779858455061913, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 223.546875, + "completions/mean_terminated_length": 223.546875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2166321575641632, + "epoch": 1.3700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9519877488685602, + "kl": 0.07133737206459045, + "learning_rate": 6.619418597060159e-07, + "loss": 0.0072, + "num_tokens": 35306410.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9024752378463745, + "sampling/importance_sampling_ratio/mean": 0.9997768998146057, + "sampling/importance_sampling_ratio/min": 0.6491274833679199, + "sampling/sampling_logp_difference/max": 0.6431558132171631, + "sampling/sampling_logp_difference/mean": 0.011922700330615044, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 177.921875, + "completions/mean_terminated_length": 177.921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.23126693069934845, + "epoch": 1.3713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42967752946002885, + "kl": 0.10150554031133652, + "learning_rate": 6.612677157365425e-07, + "loss": 0.001, + "num_tokens": 35334837.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5501295328140259, + "sampling/importance_sampling_ratio/mean": 1.000143051147461, + "sampling/importance_sampling_ratio/min": 0.4596952497959137, + "sampling/sampling_logp_difference/max": 0.7771915197372437, + "sampling/sampling_logp_difference/mean": 0.01388323213905096, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 242.703125, + "completions/mean_terminated_length": 242.703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3640415072441101, + "epoch": 1.3725490196078431, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7909021871087463, + "kl": 0.1598997712135315, + "learning_rate": 6.605932444038228e-07, + "loss": -0.0379, + "num_tokens": 35366578.0, + "reward": 0.46875, + "reward_std": 0.5431214570999146, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001885890960693, + "sampling/importance_sampling_ratio/min": 0.5097939968109131, + "sampling/sampling_logp_difference/max": 1.179419755935669, + "sampling/sampling_logp_difference/mean": 0.01787789911031723, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 205.78125, + "completions/mean_terminated_length": 205.78125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.31297239661216736, + "epoch": 1.3737745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06916975114202925, + "kl": 0.11701078712940216, + "learning_rate": 6.599184470769908e-07, + "loss": 0.0012, + "num_tokens": 35394180.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8037598133087158, + "sampling/importance_sampling_ratio/mean": 0.9997196197509766, + "sampling/importance_sampling_ratio/min": 0.535176694393158, + "sampling/sampling_logp_difference/max": 0.6251583099365234, + "sampling/sampling_logp_difference/mean": 0.016619134694337845, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 245.59375, + "completions/mean_terminated_length": 245.59375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.24175047874450684, + "epoch": 1.375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04624234806036233, + "kl": 0.05887269228696823, + "learning_rate": 6.592433251258422e-07, + "loss": 0.0006, + "num_tokens": 35432090.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.989425539970398, + "sampling/importance_sampling_ratio/mean": 0.9999043941497803, + "sampling/importance_sampling_ratio/min": 0.6298782229423523, + "sampling/sampling_logp_difference/max": 0.6878459453582764, + "sampling/sampling_logp_difference/mean": 0.012914846651256084, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 233.3125, + "completions/mean_terminated_length": 233.3125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.23181766271591187, + "epoch": 1.3762254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058806750135720655, + "kl": 0.08856362104415894, + "learning_rate": 6.58567879920832e-07, + "loss": 0.0009, + "num_tokens": 35464734.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995250105857849, + "sampling/importance_sampling_ratio/min": 0.2612076997756958, + "sampling/sampling_logp_difference/max": 1.3424394130706787, + "sampling/sampling_logp_difference/mean": 0.014123495668172836, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 215.15625, + "completions/mean_terminated_length": 215.15625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24334780871868134, + "epoch": 1.3774509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4454882512237588, + "kl": 0.10482418537139893, + "learning_rate": 6.578921128330714e-07, + "loss": 0.0109, + "num_tokens": 35492376.0, + "reward": 0.59375, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5745593309402466, + "sampling/importance_sampling_ratio/mean": 0.9996858239173889, + "sampling/importance_sampling_ratio/min": 0.5260617733001709, + "sampling/sampling_logp_difference/max": 0.6423366069793701, + "sampling/sampling_logp_difference/mean": 0.012269468046724796, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 189.109375, + "completions/mean_terminated_length": 189.109375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.29685068130493164, + "epoch": 1.3786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08161177455666314, + "kl": 0.12493439018726349, + "learning_rate": 6.572160252343242e-07, + "loss": 0.0013, + "num_tokens": 35525247.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7915161848068237, + "sampling/importance_sampling_ratio/mean": 0.999591588973999, + "sampling/importance_sampling_ratio/min": 0.24142293632030487, + "sampling/sampling_logp_difference/max": 1.4212050437927246, + "sampling/sampling_logp_difference/mean": 0.018257491290569305, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 217.734375, + "completions/mean_terminated_length": 217.734375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3483741879463196, + "epoch": 1.3799019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4250601586537104, + "kl": 0.20661494135856628, + "learning_rate": 6.565396184970059e-07, + "loss": -0.0062, + "num_tokens": 35559326.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.60890531539917, + "sampling/importance_sampling_ratio/mean": 0.9998926520347595, + "sampling/importance_sampling_ratio/min": 0.5127992630004883, + "sampling/sampling_logp_difference/max": 0.6678707599639893, + "sampling/sampling_logp_difference/mean": 0.017867445945739746, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 321.0625, + "completions/mean_terminated_length": 321.0625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.23988881707191467, + "epoch": 1.3811274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04530622749224077, + "kl": 0.06789304316043854, + "learning_rate": 6.558628939941791e-07, + "loss": 0.0006, + "num_tokens": 35602274.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7737213373184204, + "sampling/importance_sampling_ratio/mean": 0.999636709690094, + "sampling/importance_sampling_ratio/min": 0.5437178015708923, + "sampling/sampling_logp_difference/max": 0.6093249320983887, + "sampling/sampling_logp_difference/mean": 0.013073192909359932, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 248.75, + "completions/mean_terminated_length": 248.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.3969167172908783, + "epoch": 1.3823529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1137259934363668, + "kl": 0.11824201047420502, + "learning_rate": 6.551858530995517e-07, + "loss": -0.0029, + "num_tokens": 35638738.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6356092691421509, + "sampling/importance_sampling_ratio/mean": 1.000152349472046, + "sampling/importance_sampling_ratio/min": 0.6223164200782776, + "sampling/sampling_logp_difference/max": 0.49201536178588867, + "sampling/sampling_logp_difference/mean": 0.016793951392173767, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 268.203125, + "completions/mean_terminated_length": 268.203125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3699800372123718, + "epoch": 1.383578431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9941134239924921, + "kl": 0.10607214272022247, + "learning_rate": 6.545084971874736e-07, + "loss": -0.0054, + "num_tokens": 35676591.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5631024837493896, + "sampling/importance_sampling_ratio/mean": 0.9998665452003479, + "sampling/importance_sampling_ratio/min": 0.5544012188911438, + "sampling/sampling_logp_difference/max": 0.5898666381835938, + "sampling/sampling_logp_difference/mean": 0.018597232177853584, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 310.046875, + "completions/mean_terminated_length": 310.046875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2574818730354309, + "epoch": 1.3848039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8061084669900063, + "kl": 0.06990421563386917, + "learning_rate": 6.538308276329349e-07, + "loss": 0.0002, + "num_tokens": 35716466.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6064001321792603, + "sampling/importance_sampling_ratio/mean": 0.9998332262039185, + "sampling/importance_sampling_ratio/min": 0.6554578542709351, + "sampling/sampling_logp_difference/max": 0.4739956855773926, + "sampling/sampling_logp_difference/mean": 0.01295868493616581, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 183.234375, + "completions/mean_terminated_length": 183.234375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2953099012374878, + "epoch": 1.3860294117647058, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.883785990579241, + "kl": 0.1907866895198822, + "learning_rate": 6.531528458115614e-07, + "loss": -0.0226, + "num_tokens": 35743777.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000083446502686, + "sampling/importance_sampling_ratio/min": 0.5334725975990295, + "sampling/sampling_logp_difference/max": 0.843454122543335, + "sampling/sampling_logp_difference/mean": 0.01571395993232727, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 253.359375, + "completions/mean_terminated_length": 253.359375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2790994346141815, + "epoch": 1.3872549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0671149126434625, + "kl": 0.08887787163257599, + "learning_rate": 6.524745530996136e-07, + "loss": 0.0022, + "num_tokens": 35778600.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001745223999023, + "sampling/importance_sampling_ratio/min": 0.26177510619163513, + "sampling/sampling_logp_difference/max": 1.3402695655822754, + "sampling/sampling_logp_difference/mean": 0.01448032446205616, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 242.203125, + "completions/mean_terminated_length": 242.203125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.2669624090194702, + "epoch": 1.3884803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2722808135566526, + "kl": 0.0965794250369072, + "learning_rate": 6.517959508739825e-07, + "loss": -0.0085, + "num_tokens": 35812917.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.7493369579315186, + "sampling/importance_sampling_ratio/mean": 1.0002140998840332, + "sampling/importance_sampling_ratio/min": 0.6316691637039185, + "sampling/sampling_logp_difference/max": 0.5592367649078369, + "sampling/sampling_logp_difference/mean": 0.013571933843195438, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 304.609375, + "completions/mean_terminated_length": 304.609375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.2814604341983795, + "epoch": 1.3897058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4215723394233764, + "kl": 0.105937659740448, + "learning_rate": 6.511170405121877e-07, + "loss": 0.0101, + "num_tokens": 35849356.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.864237904548645, + "sampling/importance_sampling_ratio/mean": 1.0003492832183838, + "sampling/importance_sampling_ratio/min": 0.4562400281429291, + "sampling/sampling_logp_difference/max": 0.7847362756729126, + "sampling/sampling_logp_difference/mean": 0.014745076186954975, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 189.84375, + "completions/mean_terminated_length": 189.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2530762851238251, + "epoch": 1.3909313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3020371185266197, + "kl": 0.15191222727298737, + "learning_rate": 6.504378233923742e-07, + "loss": -0.0035, + "num_tokens": 35876066.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5546691417694092, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 0.5129217505455017, + "sampling/sampling_logp_difference/max": 0.667631983757019, + "sampling/sampling_logp_difference/mean": 0.014895346947014332, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 214.125, + "completions/mean_terminated_length": 214.125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3823487162590027, + "epoch": 1.392156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2954510462144206, + "kl": 0.16310438513755798, + "learning_rate": 6.497583008933097e-07, + "loss": 0.038, + "num_tokens": 35907866.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005600452423096, + "sampling/importance_sampling_ratio/min": 0.3722725808620453, + "sampling/sampling_logp_difference/max": 0.9881290197372437, + "sampling/sampling_logp_difference/mean": 0.018685853108763695, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 255.34375, + "completions/mean_terminated_length": 255.34375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.20661571621894836, + "epoch": 1.3933823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07365484221000322, + "kl": 0.09507159888744354, + "learning_rate": 6.490784743943818e-07, + "loss": 0.0009, + "num_tokens": 35939536.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.690643072128296, + "sampling/importance_sampling_ratio/mean": 0.9999591112136841, + "sampling/importance_sampling_ratio/min": 0.5260617733001709, + "sampling/sampling_logp_difference/max": 0.6423366069793701, + "sampling/sampling_logp_difference/mean": 0.012394268065690994, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 279.59375, + "completions/mean_terminated_length": 279.59375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.366366446018219, + "epoch": 1.3946078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.015971293057811, + "kl": 0.11405825614929199, + "learning_rate": 6.483983452755952e-07, + "loss": 0.0197, + "num_tokens": 35979478.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000872611999512, + "sampling/importance_sampling_ratio/min": 0.37959447503089905, + "sampling/sampling_logp_difference/max": 0.9686517715454102, + "sampling/sampling_logp_difference/mean": 0.016720902174711227, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 308.359375, + "completions/mean_terminated_length": 308.359375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.29545682668685913, + "epoch": 1.3958333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2149821217737968, + "kl": 0.11053871363401413, + "learning_rate": 6.477179149175692e-07, + "loss": 0.0248, + "num_tokens": 36021405.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.8086117506027222, + "sampling/importance_sampling_ratio/mean": 1.0003973245620728, + "sampling/importance_sampling_ratio/min": 0.5164161920547485, + "sampling/sampling_logp_difference/max": 0.6608422994613647, + "sampling/sampling_logp_difference/mean": 0.014341693371534348, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 251.453125, + "completions/mean_terminated_length": 251.453125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.38097473978996277, + "epoch": 1.3970588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2981838917711737, + "kl": 0.17838437855243683, + "learning_rate": 6.470371847015341e-07, + "loss": 0.0791, + "num_tokens": 36059386.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994637966156006, + "sampling/importance_sampling_ratio/min": 0.43644192814826965, + "sampling/sampling_logp_difference/max": 0.9823462963104248, + "sampling/sampling_logp_difference/mean": 0.01879509910941124, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 240.328125, + "completions/mean_terminated_length": 240.328125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.2859255075454712, + "epoch": 1.3982843137254901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05652274558298719, + "kl": 0.0896296501159668, + "learning_rate": 6.463561560093292e-07, + "loss": 0.0009, + "num_tokens": 36093951.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5874453783035278, + "sampling/importance_sampling_ratio/mean": 0.9997658729553223, + "sampling/importance_sampling_ratio/min": 0.5317541360855103, + "sampling/sampling_logp_difference/max": 0.6315741539001465, + "sampling/sampling_logp_difference/mean": 0.015782658010721207, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 381.265625, + "completions/mean_terminated_length": 381.265625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.31567683815956116, + "epoch": 1.3995098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1183489133905145, + "kl": 0.07916092127561569, + "learning_rate": 6.456748302233994e-07, + "loss": 0.0091, + "num_tokens": 36135840.0, + "reward": 0.46875, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5744246244430542, + "sampling/importance_sampling_ratio/mean": 0.9998644590377808, + "sampling/importance_sampling_ratio/min": 0.482060045003891, + "sampling/sampling_logp_difference/max": 0.7296866178512573, + "sampling/sampling_logp_difference/mean": 0.01383579894900322, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 194.0, + "completions/mean_terminated_length": 194.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.293121874332428, + "epoch": 1.4007352941176472, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.930311948823916, + "kl": 0.15231949090957642, + "learning_rate": 6.449932087267931e-07, + "loss": -0.0106, + "num_tokens": 36164016.0, + "reward": -0.3125, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997024536132812, + "sampling/importance_sampling_ratio/min": 0.5002604126930237, + "sampling/sampling_logp_difference/max": 0.7171361446380615, + "sampling/sampling_logp_difference/mean": 0.016127631068229675, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 234.4375, + "completions/mean_terminated_length": 234.4375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.22176510095596313, + "epoch": 1.4019607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035660544051021835, + "kl": 0.06697164475917816, + "learning_rate": 6.443112929031586e-07, + "loss": 0.0006, + "num_tokens": 36193820.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6496421098709106, + "sampling/importance_sampling_ratio/mean": 1.0004054307937622, + "sampling/importance_sampling_ratio/min": 0.5589194297790527, + "sampling/sampling_logp_difference/max": 0.5817499160766602, + "sampling/sampling_logp_difference/mean": 0.012055369094014168, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 224.78125, + "completions/mean_terminated_length": 224.78125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3289373517036438, + "epoch": 1.403186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.975299659007041, + "kl": 0.13227379322052002, + "learning_rate": 6.43629084136742e-07, + "loss": -0.0033, + "num_tokens": 36225614.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004262924194336, + "sampling/importance_sampling_ratio/min": 0.5649482607841492, + "sampling/sampling_logp_difference/max": 0.8808517456054688, + "sampling/sampling_logp_difference/mean": 0.01591620221734047, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 260.71875, + "completions/mean_terminated_length": 260.71875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.287993848323822, + "epoch": 1.4044117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9604651256840849, + "kl": 0.1282859891653061, + "learning_rate": 6.429465838123838e-07, + "loss": -0.0091, + "num_tokens": 36259596.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8644553422927856, + "sampling/importance_sampling_ratio/mean": 1.000173807144165, + "sampling/importance_sampling_ratio/min": 0.4801802635192871, + "sampling/sampling_logp_difference/max": 0.7335937023162842, + "sampling/sampling_logp_difference/mean": 0.015946825966238976, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 285.234375, + "completions/mean_terminated_length": 285.234375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2686070203781128, + "epoch": 1.405637254901961, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6657734507791657, + "kl": 0.10684768855571747, + "learning_rate": 6.422637933155162e-07, + "loss": 0.0146, + "num_tokens": 36296059.0, + "reward": 0.40625, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7520718574523926, + "sampling/importance_sampling_ratio/mean": 0.9999882578849792, + "sampling/importance_sampling_ratio/min": 0.5483993291854858, + "sampling/sampling_logp_difference/max": 0.600751519203186, + "sampling/sampling_logp_difference/mean": 0.014088155701756477, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.19562289118766785, + "epoch": 1.406862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05258491210013407, + "kl": 0.08688782900571823, + "learning_rate": 6.41580714032161e-07, + "loss": 0.0009, + "num_tokens": 36321523.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7471122741699219, + "sampling/importance_sampling_ratio/mean": 0.9997127056121826, + "sampling/importance_sampling_ratio/min": 0.2428247034549713, + "sampling/sampling_logp_difference/max": 1.4154155254364014, + "sampling/sampling_logp_difference/mean": 0.013245074078440666, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 206.28125, + "completions/mean_terminated_length": 206.28125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.2681342363357544, + "epoch": 1.4080882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06443269592493465, + "kl": 0.1535661518573761, + "learning_rate": 6.408973473489257e-07, + "loss": 0.0015, + "num_tokens": 36350437.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5772086381912231, + "sampling/importance_sampling_ratio/mean": 0.9998694658279419, + "sampling/importance_sampling_ratio/min": 0.48595550656318665, + "sampling/sampling_logp_difference/max": 0.7216382026672363, + "sampling/sampling_logp_difference/mean": 0.015651695430278778, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 243.515625, + "completions/mean_terminated_length": 243.515625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3394845426082611, + "epoch": 1.409313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2212874294737475, + "kl": 0.10652216523885727, + "learning_rate": 6.402136946530014e-07, + "loss": 0.007, + "num_tokens": 36386086.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.8720457553863525, + "sampling/importance_sampling_ratio/mean": 1.0006232261657715, + "sampling/importance_sampling_ratio/min": 0.5518118143081665, + "sampling/sampling_logp_difference/max": 0.6270318031311035, + "sampling/sampling_logp_difference/mean": 0.018039550632238388, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 268.421875, + "completions/mean_terminated_length": 268.421875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.35418879985809326, + "epoch": 1.4105392156862746, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.071230277672197, + "kl": 0.1232588142156601, + "learning_rate": 6.395297573321597e-07, + "loss": 0.0907, + "num_tokens": 36418801.0, + "reward": 0.875, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6880244016647339, + "sampling/importance_sampling_ratio/mean": 0.9997165203094482, + "sampling/importance_sampling_ratio/min": 0.6164071559906006, + "sampling/sampling_logp_difference/max": 0.5235588550567627, + "sampling/sampling_logp_difference/mean": 0.015983447432518005, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 300.796875, + "completions/mean_terminated_length": 300.796875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.31910884380340576, + "epoch": 1.4117647058823528, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7287904445096636, + "kl": 0.11793461441993713, + "learning_rate": 6.388455367747502e-07, + "loss": 0.0234, + "num_tokens": 36456868.0, + "reward": 0.0625, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995290040969849, + "sampling/importance_sampling_ratio/min": 0.5910096764564514, + "sampling/sampling_logp_difference/max": 1.1088428497314453, + "sampling/sampling_logp_difference/mean": 0.014415017329156399, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.3419966697692871, + "epoch": 1.4129901960784315, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0424876632302857, + "kl": 0.19268451631069183, + "learning_rate": 6.38161034369697e-07, + "loss": -0.125, + "num_tokens": 36491668.0, + "reward": 0.3125, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.7639833688735962, + "sampling/importance_sampling_ratio/mean": 1.0002565383911133, + "sampling/importance_sampling_ratio/min": 0.6652889251708984, + "sampling/sampling_logp_difference/max": 0.5675745010375977, + "sampling/sampling_logp_difference/mean": 0.01583666168153286, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 250.75, + "completions/mean_terminated_length": 250.75, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.32501575350761414, + "epoch": 1.4142156862745099, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8449033563929855, + "kl": 0.0944858193397522, + "learning_rate": 6.37476251506497e-07, + "loss": 0.0228, + "num_tokens": 36523748.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6479363441467285, + "sampling/importance_sampling_ratio/mean": 0.9998430013656616, + "sampling/importance_sampling_ratio/min": 0.4959690570831299, + "sampling/sampling_logp_difference/max": 0.7012417316436768, + "sampling/sampling_logp_difference/mean": 0.015257453545928001, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 229.765625, + "completions/mean_terminated_length": 229.765625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.37603646516799927, + "epoch": 1.4154411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3063201336657813, + "kl": 0.15218672156333923, + "learning_rate": 6.367911895752158e-07, + "loss": -0.0294, + "num_tokens": 36559605.0, + "reward": 0.5, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5568087100982666, + "sampling/importance_sampling_ratio/mean": 1.0006072521209717, + "sampling/importance_sampling_ratio/min": 0.523182213306427, + "sampling/sampling_logp_difference/max": 0.6478254795074463, + "sampling/sampling_logp_difference/mean": 0.0173235684633255, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 238.703125, + "completions/mean_terminated_length": 238.703125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3450971245765686, + "epoch": 1.4166666666666667, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5653244022543134, + "kl": 0.11214041709899902, + "learning_rate": 6.361058499664855e-07, + "loss": -0.0061, + "num_tokens": 36595314.0, + "reward": 0.03125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7519925832748413, + "sampling/importance_sampling_ratio/mean": 1.0004363059997559, + "sampling/importance_sampling_ratio/min": 0.44848787784576416, + "sampling/sampling_logp_difference/max": 0.8018736839294434, + "sampling/sampling_logp_difference/mean": 0.015167295932769775, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 204.375, + "completions/mean_terminated_length": 204.375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.2972475290298462, + "epoch": 1.4178921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06121943003613818, + "kl": 0.12594039738178253, + "learning_rate": 6.354202340715026e-07, + "loss": 0.0012, + "num_tokens": 36627306.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5914101600646973, + "sampling/importance_sampling_ratio/mean": 0.9999909400939941, + "sampling/importance_sampling_ratio/min": 0.4500995874404907, + "sampling/sampling_logp_difference/max": 0.7982864379882812, + "sampling/sampling_logp_difference/mean": 0.015716087073087692, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 263.578125, + "completions/mean_terminated_length": 263.578125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3475438952445984, + "epoch": 1.4191176470588236, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6374156335189483, + "kl": 0.13400861620903015, + "learning_rate": 6.347343432820234e-07, + "loss": 0.0278, + "num_tokens": 36664527.0, + "reward": 0.4375, + "reward_std": 0.5501632690429688, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.586793303489685, + "sampling/importance_sampling_ratio/mean": 0.9999744892120361, + "sampling/importance_sampling_ratio/min": 0.6635606288909912, + "sampling/sampling_logp_difference/max": 0.4617152214050293, + "sampling/sampling_logp_difference/mean": 0.015900876373052597, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 184.1875, + "completions/mean_terminated_length": 184.1875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.28351420164108276, + "epoch": 1.420343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07877137010457892, + "kl": 0.11289854347705841, + "learning_rate": 6.340481789903634e-07, + "loss": 0.0011, + "num_tokens": 36700043.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.738796353340149, + "sampling/importance_sampling_ratio/mean": 1.0004609823226929, + "sampling/importance_sampling_ratio/min": 0.5671127438545227, + "sampling/sampling_logp_difference/max": 0.5671970844268799, + "sampling/sampling_logp_difference/mean": 0.01656191609799862, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 266.890625, + "completions/mean_terminated_length": 266.890625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.37652432918548584, + "epoch": 1.4215686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.14840458178793, + "kl": 0.12098322808742523, + "learning_rate": 6.333617425893919e-07, + "loss": -0.038, + "num_tokens": 36733124.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5277866125106812, + "sampling/importance_sampling_ratio/mean": 1.0003337860107422, + "sampling/importance_sampling_ratio/min": 0.6173402667045593, + "sampling/sampling_logp_difference/max": 0.48233485221862793, + "sampling/sampling_logp_difference/mean": 0.016561204567551613, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 222.703125, + "completions/mean_terminated_length": 222.703125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.2620255649089813, + "epoch": 1.4227941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1591485140148, + "kl": 0.09756740927696228, + "learning_rate": 6.326750354725319e-07, + "loss": -0.0047, + "num_tokens": 36765137.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.821160078048706, + "sampling/importance_sampling_ratio/mean": 1.0002214908599854, + "sampling/importance_sampling_ratio/min": 0.5200066566467285, + "sampling/sampling_logp_difference/max": 0.6539137363433838, + "sampling/sampling_logp_difference/mean": 0.014374290592968464, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 223.015625, + "completions/mean_terminated_length": 223.015625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.36535540223121643, + "epoch": 1.4240196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7028083486710426, + "kl": 0.13686351478099823, + "learning_rate": 6.319880590337548e-07, + "loss": 0.0281, + "num_tokens": 36798162.0, + "reward": 0.28125, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.47568678855896, + "sampling/importance_sampling_ratio/mean": 1.0003669261932373, + "sampling/importance_sampling_ratio/min": 0.5781504511833191, + "sampling/sampling_logp_difference/max": 0.5479211807250977, + "sampling/sampling_logp_difference/mean": 0.0174210574477911, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 169.453125, + "completions/mean_terminated_length": 169.453125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.25308871269226074, + "epoch": 1.4252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15882293046274884, + "kl": 0.15826408565044403, + "learning_rate": 6.313008146675799e-07, + "loss": 0.0016, + "num_tokens": 36828239.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6306155920028687, + "sampling/importance_sampling_ratio/mean": 1.0002827644348145, + "sampling/importance_sampling_ratio/min": 0.5011641383171082, + "sampling/sampling_logp_difference/max": 0.690821647644043, + "sampling/sampling_logp_difference/mean": 0.01709289662539959, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 229.296875, + "completions/mean_terminated_length": 229.296875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.28702783584594727, + "epoch": 1.4264705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.211984767247007, + "kl": 0.08903306722640991, + "learning_rate": 6.306133037690692e-07, + "loss": -0.0355, + "num_tokens": 36860994.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6364332437515259, + "sampling/importance_sampling_ratio/mean": 0.9997914433479309, + "sampling/importance_sampling_ratio/min": 0.5128509998321533, + "sampling/sampling_logp_difference/max": 0.6677699089050293, + "sampling/sampling_logp_difference/mean": 0.014962265267968178, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 205.46875, + "completions/mean_terminated_length": 205.46875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.18544542789459229, + "epoch": 1.4276960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3131834377855973, + "kl": 0.0759623795747757, + "learning_rate": 6.299255277338264e-07, + "loss": 0.0247, + "num_tokens": 36893152.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6589863300323486, + "sampling/importance_sampling_ratio/mean": 0.9993994235992432, + "sampling/importance_sampling_ratio/min": 0.548751711845398, + "sampling/sampling_logp_difference/max": 0.6001091003417969, + "sampling/sampling_logp_difference/mean": 0.011164880357682705, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 217.21875, + "completions/mean_terminated_length": 217.21875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.27909189462661743, + "epoch": 1.428921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4916533217815588, + "kl": 0.09997544437646866, + "learning_rate": 6.292374879579934e-07, + "loss": 0.0004, + "num_tokens": 36921262.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6306602954864502, + "sampling/importance_sampling_ratio/mean": 0.9999811053276062, + "sampling/importance_sampling_ratio/min": 0.5853133797645569, + "sampling/sampling_logp_difference/max": 0.5356078147888184, + "sampling/sampling_logp_difference/mean": 0.013811073265969753, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 229.21875, + "completions/mean_terminated_length": 229.21875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3222309947013855, + "epoch": 1.4301470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6905276313493822, + "kl": 0.1145516037940979, + "learning_rate": 6.285491858382473e-07, + "loss": -0.0476, + "num_tokens": 36955468.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7082451581954956, + "sampling/importance_sampling_ratio/mean": 0.9996176958084106, + "sampling/importance_sampling_ratio/min": 0.5860161781311035, + "sampling/sampling_logp_difference/max": 0.5354666709899902, + "sampling/sampling_logp_difference/mean": 0.014726577326655388, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 242.796875, + "completions/mean_terminated_length": 242.796875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3567851185798645, + "epoch": 1.4313725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0993007660511787, + "kl": 0.11437512934207916, + "learning_rate": 6.278606227717978e-07, + "loss": 0.0381, + "num_tokens": 36993039.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.602103590965271, + "sampling/importance_sampling_ratio/mean": 0.9991492033004761, + "sampling/importance_sampling_ratio/min": 0.6059837341308594, + "sampling/sampling_logp_difference/max": 0.5009021759033203, + "sampling/sampling_logp_difference/mean": 0.017158126458525658, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 240.875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.340457022190094, + "epoch": 1.4325980392156863, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2564928648237985, + "kl": 0.1473807990550995, + "learning_rate": 6.271718001563843e-07, + "loss": -0.0095, + "num_tokens": 37024791.0, + "reward": 0.375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998472929000854, + "sampling/importance_sampling_ratio/min": 0.48082515597343445, + "sampling/sampling_logp_difference/max": 1.5006871223449707, + "sampling/sampling_logp_difference/mean": 0.01723838783800602, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 264.65625, + "completions/mean_terminated_length": 264.65625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.35243093967437744, + "epoch": 1.4338235294117647, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1063927522456454, + "kl": 0.1826269030570984, + "learning_rate": 6.264827193902731e-07, + "loss": 0.0448, + "num_tokens": 37063825.0, + "reward": 0.65625, + "reward_std": 0.7129635810852051, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5744361877441406, + "sampling/importance_sampling_ratio/mean": 0.999392032623291, + "sampling/importance_sampling_ratio/min": 0.5328908562660217, + "sampling/sampling_logp_difference/max": 0.6294386386871338, + "sampling/sampling_logp_difference/mean": 0.016563165932893753, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 221.6875, + "completions/mean_terminated_length": 221.6875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3303709626197815, + "epoch": 1.4350490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.224968075553671, + "kl": 0.1414005309343338, + "learning_rate": 6.257933818722542e-07, + "loss": -0.0035, + "num_tokens": 37098269.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6598443984985352, + "sampling/importance_sampling_ratio/mean": 1.0001469850540161, + "sampling/importance_sampling_ratio/min": 0.6217727065086365, + "sampling/sampling_logp_difference/max": 0.5067238807678223, + "sampling/sampling_logp_difference/mean": 0.01730150170624256, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 212.984375, + "completions/mean_terminated_length": 212.984375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3266046941280365, + "epoch": 1.4362745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7800707386255215, + "kl": 0.11052196472883224, + "learning_rate": 6.251037890016395e-07, + "loss": -0.0443, + "num_tokens": 37127308.0, + "reward": 0.28125, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.988501787185669, + "sampling/importance_sampling_ratio/mean": 0.99956214427948, + "sampling/importance_sampling_ratio/min": 0.46266159415245056, + "sampling/sampling_logp_difference/max": 0.7707594037055969, + "sampling/sampling_logp_difference/mean": 0.016007719561457634, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 207.984375, + "completions/mean_terminated_length": 207.984375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.3158145248889923, + "epoch": 1.4375, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2159819180519693, + "kl": 0.14949479699134827, + "learning_rate": 6.244139421782587e-07, + "loss": 0.0189, + "num_tokens": 37154363.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6023591756820679, + "sampling/importance_sampling_ratio/mean": 0.9998407959938049, + "sampling/importance_sampling_ratio/min": 0.6270001530647278, + "sampling/sampling_logp_difference/max": 0.47147703170776367, + "sampling/sampling_logp_difference/mean": 0.014544149860739708, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 247.484375, + "completions/mean_terminated_length": 247.484375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2983068823814392, + "epoch": 1.4387254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.337779453550791, + "kl": 0.0902748703956604, + "learning_rate": 6.237238428024571e-07, + "loss": -0.0021, + "num_tokens": 37189466.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.663038730621338, + "sampling/importance_sampling_ratio/mean": 1.0002652406692505, + "sampling/importance_sampling_ratio/min": 0.6163129210472107, + "sampling/sampling_logp_difference/max": 0.5086464881896973, + "sampling/sampling_logp_difference/mean": 0.015390568412840366, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 180.3125, + "completions/mean_terminated_length": 180.3125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.25251686573028564, + "epoch": 1.4399509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.211544059427017, + "kl": 0.15830758213996887, + "learning_rate": 6.230334922750929e-07, + "loss": -0.127, + "num_tokens": 37215150.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6885360479354858, + "sampling/importance_sampling_ratio/mean": 0.99983811378479, + "sampling/importance_sampling_ratio/min": 0.46046993136405945, + "sampling/sampling_logp_difference/max": 0.7755076885223389, + "sampling/sampling_logp_difference/mean": 0.013846715912222862, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 201.40625, + "completions/mean_terminated_length": 201.40625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2913873493671417, + "epoch": 1.4411764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4796317158140682, + "kl": 0.13480722904205322, + "learning_rate": 6.223428919975338e-07, + "loss": 0.0152, + "num_tokens": 37246936.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5692412853240967, + "sampling/importance_sampling_ratio/mean": 1.000030755996704, + "sampling/importance_sampling_ratio/min": 0.36347436904907227, + "sampling/sampling_logp_difference/max": 1.0120465755462646, + "sampling/sampling_logp_difference/mean": 0.014956055209040642, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 230.578125, + "completions/mean_terminated_length": 230.578125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.35280194878578186, + "epoch": 1.4424019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2057679055290507, + "kl": 0.13378706574440002, + "learning_rate": 6.216520433716544e-07, + "loss": 0.0099, + "num_tokens": 37279405.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4461430311203003, + "sampling/importance_sampling_ratio/mean": 0.9984460473060608, + "sampling/importance_sampling_ratio/min": 0.4309263527393341, + "sampling/sampling_logp_difference/max": 0.84181809425354, + "sampling/sampling_logp_difference/mean": 0.017107432708144188, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 193.15625, + "completions/mean_terminated_length": 193.15625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3700255751609802, + "epoch": 1.4436274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2374435367190664, + "kl": 0.12329597026109695, + "learning_rate": 6.209609477998338e-07, + "loss": 0.0842, + "num_tokens": 37312615.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992519617080688, + "sampling/importance_sampling_ratio/min": 0.6034554243087769, + "sampling/sampling_logp_difference/max": 0.7990565299987793, + "sampling/sampling_logp_difference/mean": 0.018614256754517555, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 159.984375, + "completions/mean_terminated_length": 159.984375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.33156856894493103, + "epoch": 1.4448529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6806987115221546, + "kl": 0.13869813084602356, + "learning_rate": 6.202696066849524e-07, + "loss": -0.008, + "num_tokens": 37337350.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.93405282497406, + "sampling/importance_sampling_ratio/mean": 0.9998618364334106, + "sampling/importance_sampling_ratio/min": 0.4030509293079376, + "sampling/sampling_logp_difference/max": 0.9086923599243164, + "sampling/sampling_logp_difference/mean": 0.017438728362321854, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 173.609375, + "completions/mean_terminated_length": 173.609375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3063565492630005, + "epoch": 1.446078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7073736999319093, + "kl": 0.11664046347141266, + "learning_rate": 6.195780214303887e-07, + "loss": -0.0128, + "num_tokens": 37371341.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8195306062698364, + "sampling/importance_sampling_ratio/mean": 1.0002837181091309, + "sampling/importance_sampling_ratio/min": 0.618789553642273, + "sampling/sampling_logp_difference/max": 0.5985785722732544, + "sampling/sampling_logp_difference/mean": 0.01647639088332653, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 233.90625, + "completions/mean_terminated_length": 233.90625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.27668413519859314, + "epoch": 1.4473039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0096771439449546, + "kl": 0.11705988645553589, + "learning_rate": 6.188861934400171e-07, + "loss": -0.0418, + "num_tokens": 37409799.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7785027027130127, + "sampling/importance_sampling_ratio/mean": 1.0001142024993896, + "sampling/importance_sampling_ratio/min": 0.4169924557209015, + "sampling/sampling_logp_difference/max": 0.8746871948242188, + "sampling/sampling_logp_difference/mean": 0.014162404462695122, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 218.671875, + "completions/mean_terminated_length": 218.671875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.30626606941223145, + "epoch": 1.4485294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05596439045356626, + "kl": 0.09142415225505829, + "learning_rate": 6.181941241182043e-07, + "loss": 0.0009, + "num_tokens": 37446546.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4812122583389282, + "sampling/importance_sampling_ratio/mean": 0.9997881054878235, + "sampling/importance_sampling_ratio/min": 0.47619307041168213, + "sampling/sampling_logp_difference/max": 0.7419319152832031, + "sampling/sampling_logp_difference/mean": 0.01617203839123249, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 155.828125, + "completions/mean_terminated_length": 155.828125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.2763192057609558, + "epoch": 1.4497549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.5668047830796996, + "kl": 0.17243480682373047, + "learning_rate": 6.175018148698076e-07, + "loss": -0.0257, + "num_tokens": 37474951.0, + "reward": 0.15625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.702722191810608, + "sampling/importance_sampling_ratio/mean": 1.0007953643798828, + "sampling/importance_sampling_ratio/min": 0.5423819422721863, + "sampling/sampling_logp_difference/max": 0.611784815788269, + "sampling/sampling_logp_difference/mean": 0.01690756157040596, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 211.21875, + "completions/mean_terminated_length": 211.21875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.32851842045783997, + "epoch": 1.4509803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2834727068697949, + "kl": 0.17326781153678894, + "learning_rate": 6.168092671001705e-07, + "loss": 0.0135, + "num_tokens": 37509093.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8608806133270264, + "sampling/importance_sampling_ratio/mean": 1.0005269050598145, + "sampling/importance_sampling_ratio/min": 0.5097377300262451, + "sampling/sampling_logp_difference/max": 0.6738590002059937, + "sampling/sampling_logp_difference/mean": 0.017571967095136642, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 186.953125, + "completions/mean_terminated_length": 186.953125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.28477370738983154, + "epoch": 1.4522058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04943827233911848, + "kl": 0.10224197059869766, + "learning_rate": 6.161164822151213e-07, + "loss": 0.001, + "num_tokens": 37539602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5690999031066895, + "sampling/importance_sampling_ratio/mean": 0.9998160600662231, + "sampling/importance_sampling_ratio/min": 0.45969608426094055, + "sampling/sampling_logp_difference/max": 0.7771897315979004, + "sampling/sampling_logp_difference/mean": 0.015761321410536766, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 223.84375, + "completions/mean_terminated_length": 223.84375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2302074134349823, + "epoch": 1.4534313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.015837087750094, + "kl": 0.08969669044017792, + "learning_rate": 6.154234616209692e-07, + "loss": -0.0176, + "num_tokens": 37573496.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6544609069824219, + "sampling/importance_sampling_ratio/mean": 1.0001399517059326, + "sampling/importance_sampling_ratio/min": 0.5637956857681274, + "sampling/sampling_logp_difference/max": 0.5730633735656738, + "sampling/sampling_logp_difference/mean": 0.012367047369480133, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 220.28125, + "completions/mean_terminated_length": 220.28125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3840216398239136, + "epoch": 1.454656862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.456764309807822, + "kl": 0.11951378732919693, + "learning_rate": 6.147302067245028e-07, + "loss": 0.0077, + "num_tokens": 37603866.0, + "reward": -0.1875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5749192237854004, + "sampling/importance_sampling_ratio/mean": 0.999834418296814, + "sampling/importance_sampling_ratio/min": 0.059670716524124146, + "sampling/sampling_logp_difference/max": 2.8189139366149902, + "sampling/sampling_logp_difference/mean": 0.01715525984764099, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 197.015625, + "completions/mean_terminated_length": 197.015625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3232305645942688, + "epoch": 1.4558823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3193424143390062, + "kl": 0.10033085197210312, + "learning_rate": 6.140367189329847e-07, + "loss": -0.0531, + "num_tokens": 37634203.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.7066165208816528, + "sampling/importance_sampling_ratio/mean": 1.000067949295044, + "sampling/importance_sampling_ratio/min": 0.47672104835510254, + "sampling/sampling_logp_difference/max": 0.7408237457275391, + "sampling/sampling_logp_difference/mean": 0.01664803922176361, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 184.015625, + "completions/mean_terminated_length": 184.015625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3196144700050354, + "epoch": 1.4571078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11496818100434487, + "kl": 0.12379920482635498, + "learning_rate": 6.133429996541518e-07, + "loss": 0.0011, + "num_tokens": 37664444.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000017523765564, + "sampling/importance_sampling_ratio/min": 0.11415062844753265, + "sampling/sampling_logp_difference/max": 2.170236349105835, + "sampling/sampling_logp_difference/mean": 0.01712869480252266, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 219.15625, + "completions/mean_terminated_length": 219.15625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2887090742588043, + "epoch": 1.4583333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1348927038126526, + "kl": 0.11597549915313721, + "learning_rate": 6.1264905029621e-07, + "loss": 0.0097, + "num_tokens": 37700438.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6932430267333984, + "sampling/importance_sampling_ratio/mean": 1.0000178813934326, + "sampling/importance_sampling_ratio/min": 0.5735724568367004, + "sampling/sampling_logp_difference/max": 0.5558710694313049, + "sampling/sampling_logp_difference/mean": 0.015668664127588272, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 181.390625, + "completions/mean_terminated_length": 181.390625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.28322315216064453, + "epoch": 1.4595588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06817020188729356, + "kl": 0.14192894101142883, + "learning_rate": 6.119548722678327e-07, + "loss": 0.0014, + "num_tokens": 37733519.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99979567527771, + "sampling/importance_sampling_ratio/min": 0.6217517852783203, + "sampling/sampling_logp_difference/max": 0.7287606000900269, + "sampling/sampling_logp_difference/mean": 0.016558753326535225, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 191.125, + "completions/mean_terminated_length": 191.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.28417468070983887, + "epoch": 1.4607843137254901, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2412866245053409, + "kl": 0.10667260736227036, + "learning_rate": 6.112604669781572e-07, + "loss": 0.0356, + "num_tokens": 37763191.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6555081605911255, + "sampling/importance_sampling_ratio/mean": 1.0000934600830078, + "sampling/importance_sampling_ratio/min": 0.22631296515464783, + "sampling/sampling_logp_difference/max": 1.4858365058898926, + "sampling/sampling_logp_difference/mean": 0.014470456168055534, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 240.765625, + "completions/mean_terminated_length": 240.765625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.3636624813079834, + "epoch": 1.4620098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.578333476370098, + "kl": 0.11750581115484238, + "learning_rate": 6.105658358367822e-07, + "loss": 0.038, + "num_tokens": 37797688.0, + "reward": 0.625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5071572065353394, + "sampling/importance_sampling_ratio/mean": 1.000314474105835, + "sampling/importance_sampling_ratio/min": 0.6122506260871887, + "sampling/sampling_logp_difference/max": 0.4906136393547058, + "sampling/sampling_logp_difference/mean": 0.01553491409868002, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 170.125, + "completions/mean_terminated_length": 170.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.219697505235672, + "epoch": 1.4632352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044637562856320645, + "kl": 0.08123414218425751, + "learning_rate": 6.098709802537653e-07, + "loss": 0.0009, + "num_tokens": 37821888.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9099587202072144, + "sampling/importance_sampling_ratio/mean": 1.0004470348358154, + "sampling/importance_sampling_ratio/min": 0.48237401247024536, + "sampling/sampling_logp_difference/max": 0.729035496711731, + "sampling/sampling_logp_difference/mean": 0.01361137256026268, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 207.03125, + "completions/mean_terminated_length": 207.03125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2628161609172821, + "epoch": 1.4644607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4571893795286242, + "kl": 0.08402369171380997, + "learning_rate": 6.091759016396188e-07, + "loss": -0.0356, + "num_tokens": 37852498.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5961744785308838, + "sampling/importance_sampling_ratio/mean": 1.0001986026763916, + "sampling/importance_sampling_ratio/min": 0.49821826815605164, + "sampling/sampling_logp_difference/max": 0.6967170238494873, + "sampling/sampling_logp_difference/mean": 0.014230694621801376, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 159.4375, + "completions/mean_terminated_length": 159.4375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2504669427871704, + "epoch": 1.465686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.307576053585658, + "kl": 0.12320666015148163, + "learning_rate": 6.084806014053086e-07, + "loss": 0.0013, + "num_tokens": 37878766.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.530218482017517, + "sampling/importance_sampling_ratio/mean": 0.9995049238204956, + "sampling/importance_sampling_ratio/min": 0.5890116095542908, + "sampling/sampling_logp_difference/max": 0.5293093323707581, + "sampling/sampling_logp_difference/mean": 0.013893891125917435, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 173.84375, + "completions/mean_terminated_length": 173.84375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.1905520260334015, + "epoch": 1.4669117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2072480243072812, + "kl": 0.07715676724910736, + "learning_rate": 6.077850809622498e-07, + "loss": 0.038, + "num_tokens": 37907428.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9979921579360962, + "sampling/importance_sampling_ratio/mean": 1.0002038478851318, + "sampling/importance_sampling_ratio/min": 0.6058018207550049, + "sampling/sampling_logp_difference/max": 0.6921427249908447, + "sampling/sampling_logp_difference/mean": 0.010474804788827896, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 183.953125, + "completions/mean_terminated_length": 183.953125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.32976096868515015, + "epoch": 1.468137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6320427464177791, + "kl": 0.19351902604103088, + "learning_rate": 6.070893417223052e-07, + "loss": 0.0134, + "num_tokens": 37933073.0, + "reward": 0.59375, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6176202297210693, + "sampling/importance_sampling_ratio/mean": 0.9999404549598694, + "sampling/importance_sampling_ratio/min": 0.5917201042175293, + "sampling/sampling_logp_difference/max": 0.5247215628623962, + "sampling/sampling_logp_difference/mean": 0.016002189368009567, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 181.5625, + "completions/mean_terminated_length": 181.5625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.31580162048339844, + "epoch": 1.469362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3271427980708945, + "kl": 0.22780528664588928, + "learning_rate": 6.06393385097781e-07, + "loss": -0.0021, + "num_tokens": 37964853.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004262924194336, + "sampling/importance_sampling_ratio/min": 0.5923798084259033, + "sampling/sampling_logp_difference/max": 1.3286752700805664, + "sampling/sampling_logp_difference/mean": 0.014728747308254242, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 193.046875, + "completions/mean_terminated_length": 193.046875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.26126304268836975, + "epoch": 1.4705882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.753164343747916, + "kl": 0.15420006215572357, + "learning_rate": 6.056972125014254e-07, + "loss": -0.0152, + "num_tokens": 37994568.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4986714124679565, + "sampling/importance_sampling_ratio/mean": 0.9999833106994629, + "sampling/importance_sampling_ratio/min": 0.47390836477279663, + "sampling/sampling_logp_difference/max": 0.7467412948608398, + "sampling/sampling_logp_difference/mean": 0.012895776890218258, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 227.796875, + "completions/mean_terminated_length": 227.796875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.32024574279785156, + "epoch": 1.471813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.17119847616517, + "kl": 0.11787056922912598, + "learning_rate": 6.050008253464246e-07, + "loss": 0.0141, + "num_tokens": 38027835.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5791263580322266, + "sampling/importance_sampling_ratio/mean": 0.9998366832733154, + "sampling/importance_sampling_ratio/min": 0.5746538043022156, + "sampling/sampling_logp_difference/max": 0.5539875030517578, + "sampling/sampling_logp_difference/mean": 0.015224370174109936, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 187.53125, + "completions/mean_terminated_length": 187.53125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.29485833644866943, + "epoch": 1.4730392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05094943383238978, + "kl": 0.10031016170978546, + "learning_rate": 6.043042250464004e-07, + "loss": 0.001, + "num_tokens": 38062349.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7593786716461182, + "sampling/importance_sampling_ratio/mean": 1.0004372596740723, + "sampling/importance_sampling_ratio/min": 0.5719299912452698, + "sampling/sampling_logp_difference/max": 0.5649607181549072, + "sampling/sampling_logp_difference/mean": 0.017132576555013657, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 197.03125, + "completions/mean_terminated_length": 197.03125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.30228111147880554, + "epoch": 1.4742647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3274662296376223, + "kl": 0.16679570078849792, + "learning_rate": 6.036074130154071e-07, + "loss": -0.0177, + "num_tokens": 38091711.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.624650478363037, + "sampling/importance_sampling_ratio/mean": 1.0004812479019165, + "sampling/importance_sampling_ratio/min": 0.5937528014183044, + "sampling/sampling_logp_difference/max": 0.5212922096252441, + "sampling/sampling_logp_difference/mean": 0.015419438481330872, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 220.03125, + "completions/mean_terminated_length": 220.03125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.27057573199272156, + "epoch": 1.4754901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8076838544415822, + "kl": 0.07092368602752686, + "learning_rate": 6.029103906679293e-07, + "loss": -0.0013, + "num_tokens": 38122433.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6192903518676758, + "sampling/importance_sampling_ratio/mean": 0.9996510148048401, + "sampling/importance_sampling_ratio/min": 0.5873081684112549, + "sampling/sampling_logp_difference/max": 0.5322055816650391, + "sampling/sampling_logp_difference/mean": 0.013554556295275688, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 179.5, + "completions/mean_terminated_length": 179.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.23620840907096863, + "epoch": 1.4767156862745099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04665933164818391, + "kl": 0.08392497897148132, + "learning_rate": 6.022131594188777e-07, + "loss": 0.0008, + "num_tokens": 38158353.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6152162551879883, + "sampling/importance_sampling_ratio/mean": 0.9992532730102539, + "sampling/importance_sampling_ratio/min": 0.5516251921653748, + "sampling/sampling_logp_difference/max": 0.5948865413665771, + "sampling/sampling_logp_difference/mean": 0.013747544959187508, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 207.859375, + "completions/mean_terminated_length": 207.859375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2698655128479004, + "epoch": 1.4779411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07123002636763835, + "kl": 0.08243486285209656, + "learning_rate": 6.01515720683588e-07, + "loss": 0.0009, + "num_tokens": 38185272.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6216869354248047, + "sampling/importance_sampling_ratio/mean": 0.9996257424354553, + "sampling/importance_sampling_ratio/min": 0.591215968132019, + "sampling/sampling_logp_difference/max": 0.5255739688873291, + "sampling/sampling_logp_difference/mean": 0.01550292782485485, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 212.953125, + "completions/mean_terminated_length": 212.953125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.29221758246421814, + "epoch": 1.4791666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1799150266306437, + "kl": 0.11821890622377396, + "learning_rate": 6.008180758778166e-07, + "loss": 0.0338, + "num_tokens": 38216533.0, + "reward": -0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998124837875366, + "sampling/importance_sampling_ratio/min": 0.3483121991157532, + "sampling/sampling_logp_difference/max": 1.0546560287475586, + "sampling/sampling_logp_difference/mean": 0.017921222373843193, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 238.28125, + "completions/mean_terminated_length": 238.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3860968351364136, + "epoch": 1.4803921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.323393463562757, + "kl": 0.14008009433746338, + "learning_rate": 6.001202264177382e-07, + "loss": 0.0007, + "num_tokens": 38250951.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000505805015564, + "sampling/importance_sampling_ratio/min": 0.34883177280426025, + "sampling/sampling_logp_difference/max": 1.0531654357910156, + "sampling/sampling_logp_difference/mean": 0.018849536776542664, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 202.71875, + "completions/mean_terminated_length": 202.71875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2686987519264221, + "epoch": 1.4816176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04456395390141526, + "kl": 0.08406025171279907, + "learning_rate": 5.99422173719943e-07, + "loss": 0.0008, + "num_tokens": 38280661.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999376535415649, + "sampling/importance_sampling_ratio/min": 0.5367775559425354, + "sampling/sampling_logp_difference/max": 0.8054203987121582, + "sampling/sampling_logp_difference/mean": 0.014889596030116081, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 192.96875, + "completions/mean_terminated_length": 192.96875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3588130474090576, + "epoch": 1.482843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06278643958629222, + "kl": 0.12569394707679749, + "learning_rate": 5.987239192014335e-07, + "loss": 0.0013, + "num_tokens": 38314995.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6949001550674438, + "sampling/importance_sampling_ratio/mean": 0.9996659755706787, + "sampling/importance_sampling_ratio/min": 0.4953877627849579, + "sampling/sampling_logp_difference/max": 0.7024144530296326, + "sampling/sampling_logp_difference/mean": 0.019107583910226822, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 189.609375, + "completions/mean_terminated_length": 189.609375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3276250958442688, + "epoch": 1.4840686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3275975574656989, + "kl": 0.16689805686473846, + "learning_rate": 5.980254642796226e-07, + "loss": -0.0044, + "num_tokens": 38344090.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5865861177444458, + "sampling/importance_sampling_ratio/mean": 0.9994770288467407, + "sampling/importance_sampling_ratio/min": 0.3266303837299347, + "sampling/sampling_logp_difference/max": 1.1189260482788086, + "sampling/sampling_logp_difference/mean": 0.015126791782677174, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 152.296875, + "completions/mean_terminated_length": 152.296875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.28168967366218567, + "epoch": 1.4852941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3328207477552263, + "kl": 0.13398806750774384, + "learning_rate": 5.973268103723293e-07, + "loss": 0.0074, + "num_tokens": 38368861.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8144325017929077, + "sampling/importance_sampling_ratio/mean": 1.0001928806304932, + "sampling/importance_sampling_ratio/min": 0.6089951395988464, + "sampling/sampling_logp_difference/max": 0.5957727432250977, + "sampling/sampling_logp_difference/mean": 0.01566866785287857, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 325.59375, + "completions/mean_terminated_length": 325.59375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.3338436782360077, + "epoch": 1.4865196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03745037643506664, + "kl": 0.06090451776981354, + "learning_rate": 5.966279588977766e-07, + "loss": 0.0006, + "num_tokens": 38410163.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7999895811080933, + "sampling/importance_sampling_ratio/mean": 0.9999022483825684, + "sampling/importance_sampling_ratio/min": 0.5719600319862366, + "sampling/sampling_logp_difference/max": 0.5877808928489685, + "sampling/sampling_logp_difference/mean": 0.014957626350224018, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 246.15625, + "completions/mean_terminated_length": 246.15625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.27992749214172363, + "epoch": 1.4877450980392157, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.872454806566852, + "kl": 0.09845226258039474, + "learning_rate": 5.959289112745891e-07, + "loss": 0.0565, + "num_tokens": 38444045.0, + "reward": 0.71875, + "reward_std": 0.565913200378418, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999333620071411, + "sampling/importance_sampling_ratio/min": 0.35846590995788574, + "sampling/sampling_logp_difference/max": 1.0259218215942383, + "sampling/sampling_logp_difference/mean": 0.013677786104381084, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 239.796875, + "completions/mean_terminated_length": 239.796875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.26544874906539917, + "epoch": 1.4889705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7343358666089387, + "kl": 0.136094868183136, + "learning_rate": 5.952296689217889e-07, + "loss": -0.0005, + "num_tokens": 38475248.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7804735898971558, + "sampling/importance_sampling_ratio/mean": 1.0001195669174194, + "sampling/importance_sampling_ratio/min": 0.42388060688972473, + "sampling/sampling_logp_difference/max": 0.858303427696228, + "sampling/sampling_logp_difference/mean": 0.014023521915078163, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 147.109375, + "completions/mean_terminated_length": 147.109375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.17752957344055176, + "epoch": 1.4901960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05915417617703328, + "kl": 0.08745570480823517, + "learning_rate": 5.945302332587938e-07, + "loss": 0.0009, + "num_tokens": 38501255.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002703666687012, + "sampling/importance_sampling_ratio/min": 0.5038222074508667, + "sampling/sampling_logp_difference/max": 0.8376307487487793, + "sampling/sampling_logp_difference/mean": 0.012213384732604027, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 220.375, + "completions/mean_terminated_length": 220.375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2942553460597992, + "epoch": 1.491421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3111210153575885, + "kl": 0.1029469296336174, + "learning_rate": 5.938306057054138e-07, + "loss": -0.0338, + "num_tokens": 38531711.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000220775604248, + "sampling/importance_sampling_ratio/min": 0.6058831810951233, + "sampling/sampling_logp_difference/max": 0.8302359580993652, + "sampling/sampling_logp_difference/mean": 0.013989787548780441, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 259.125, + "completions/mean_terminated_length": 259.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.20770685374736786, + "epoch": 1.4926470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02846568663031406, + "kl": 0.059119801968336105, + "learning_rate": 5.931307876818487e-07, + "loss": 0.0005, + "num_tokens": 38567207.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5212385654449463, + "sampling/importance_sampling_ratio/mean": 1.000232458114624, + "sampling/importance_sampling_ratio/min": 0.6117788553237915, + "sampling/sampling_logp_difference/max": 0.49138450622558594, + "sampling/sampling_logp_difference/mean": 0.01106779370456934, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 206.28125, + "completions/mean_terminated_length": 206.28125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.29796284437179565, + "epoch": 1.4938725490196079, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.648576252755327, + "kl": 0.20413929224014282, + "learning_rate": 5.924307806086843e-07, + "loss": 0.015, + "num_tokens": 38596681.0, + "reward": 0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6358323097229004, + "sampling/importance_sampling_ratio/mean": 1.000525951385498, + "sampling/importance_sampling_ratio/min": 0.5990678668022156, + "sampling/sampling_logp_difference/max": 0.5123803615570068, + "sampling/sampling_logp_difference/mean": 0.015031736344099045, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 192.0625, + "completions/mean_terminated_length": 192.0625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.20849837362766266, + "epoch": 1.4950980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050280758429565336, + "kl": 0.08213002979755402, + "learning_rate": 5.917305859068911e-07, + "loss": 0.0008, + "num_tokens": 38626109.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5541421175003052, + "sampling/importance_sampling_ratio/mean": 1.0002679824829102, + "sampling/importance_sampling_ratio/min": 0.5370172262191772, + "sampling/sampling_logp_difference/max": 0.6217250823974609, + "sampling/sampling_logp_difference/mean": 0.011642636731266975, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 238.890625, + "completions/mean_terminated_length": 238.890625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.22361789643764496, + "epoch": 1.4963235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04288308873635731, + "kl": 0.08960422873497009, + "learning_rate": 5.910302049978199e-07, + "loss": 0.0009, + "num_tokens": 38658694.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5146970748901367, + "sampling/importance_sampling_ratio/mean": 0.9996353387832642, + "sampling/importance_sampling_ratio/min": 0.6171490550041199, + "sampling/sampling_logp_difference/max": 0.4826446771621704, + "sampling/sampling_logp_difference/mean": 0.012359712272882462, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 196.140625, + "completions/mean_terminated_length": 196.140625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.33020782470703125, + "epoch": 1.4975490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7846041729866438, + "kl": 0.1470070332288742, + "learning_rate": 5.903296393031995e-07, + "loss": 0.0156, + "num_tokens": 38689423.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9609761238098145, + "sampling/importance_sampling_ratio/mean": 0.9995192885398865, + "sampling/importance_sampling_ratio/min": 0.16146506369113922, + "sampling/sampling_logp_difference/max": 1.8234665393829346, + "sampling/sampling_logp_difference/mean": 0.017301514744758606, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 256.671875, + "completions/mean_terminated_length": 256.671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.25761643052101135, + "epoch": 1.4987745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03820689698028908, + "kl": 0.06466948240995407, + "learning_rate": 5.896288902451338e-07, + "loss": 0.0006, + "num_tokens": 38728570.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.818019151687622, + "sampling/importance_sampling_ratio/mean": 0.9998496770858765, + "sampling/importance_sampling_ratio/min": 0.4847292900085449, + "sampling/sampling_logp_difference/max": 0.7241647243499756, + "sampling/sampling_logp_difference/mean": 0.013553157448768616, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 190.65625, + "completions/mean_terminated_length": 190.65625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.19422778487205505, + "epoch": 1.5, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1709296490721015, + "kl": 0.07536928355693817, + "learning_rate": 5.88927959246099e-07, + "loss": -0.0322, + "num_tokens": 38756804.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.8888996839523315, + "sampling/importance_sampling_ratio/mean": 1.00026273727417, + "sampling/importance_sampling_ratio/min": 0.6327093243598938, + "sampling/sampling_logp_difference/max": 0.6359944343566895, + "sampling/sampling_logp_difference/mean": 0.009969940409064293, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 189.75, + "completions/mean_terminated_length": 189.75, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.30658310651779175, + "epoch": 1.5012254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06938694861133382, + "kl": 0.11786216497421265, + "learning_rate": 5.882268477289408e-07, + "loss": 0.0012, + "num_tokens": 38790228.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5873552560806274, + "sampling/importance_sampling_ratio/mean": 1.00019371509552, + "sampling/importance_sampling_ratio/min": 0.5111851096153259, + "sampling/sampling_logp_difference/max": 0.6710236072540283, + "sampling/sampling_logp_difference/mean": 0.015146405436098576, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1453.0, + "completions/max_terminated_length": 1453.0, + "completions/mean_length": 268.1875, + "completions/mean_terminated_length": 268.1875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.25314539670944214, + "epoch": 1.5024509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035721603474821256, + "kl": 0.06015956401824951, + "learning_rate": 5.875255571168709e-07, + "loss": 0.0006, + "num_tokens": 38823360.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6903550624847412, + "sampling/importance_sampling_ratio/mean": 1.0005155801773071, + "sampling/importance_sampling_ratio/min": 0.24656350910663605, + "sampling/sampling_logp_difference/max": 1.4001357555389404, + "sampling/sampling_logp_difference/mean": 0.013511475175619125, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 185.125, + "completions/mean_terminated_length": 185.125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.228925883769989, + "epoch": 1.5036764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04026923665129843, + "kl": 0.08304295688867569, + "learning_rate": 5.868240888334652e-07, + "loss": 0.0008, + "num_tokens": 38851480.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6771751642227173, + "sampling/importance_sampling_ratio/mean": 1.0003247261047363, + "sampling/importance_sampling_ratio/min": 0.6547366380691528, + "sampling/sampling_logp_difference/max": 0.5171109437942505, + "sampling/sampling_logp_difference/mean": 0.013526812195777893, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 269.734375, + "completions/mean_terminated_length": 269.734375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3207527995109558, + "epoch": 1.5049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11007261581515085, + "kl": 0.08292139321565628, + "learning_rate": 5.861224443026595e-07, + "loss": 0.0009, + "num_tokens": 38888967.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7183313369750977, + "sampling/importance_sampling_ratio/mean": 1.0000386238098145, + "sampling/importance_sampling_ratio/min": 0.5618218183517456, + "sampling/sampling_logp_difference/max": 0.5765705108642578, + "sampling/sampling_logp_difference/mean": 0.015759730711579323, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 217.859375, + "completions/mean_terminated_length": 217.859375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.27605897188186646, + "epoch": 1.5061274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.009356334476998, + "kl": 0.11253257095813751, + "learning_rate": 5.854206249487478e-07, + "loss": -0.0152, + "num_tokens": 38917854.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.9505116939544678, + "sampling/importance_sampling_ratio/mean": 0.9997677803039551, + "sampling/importance_sampling_ratio/min": 0.38817811012268066, + "sampling/sampling_logp_difference/max": 0.9462909698486328, + "sampling/sampling_logp_difference/mean": 0.013493069447577, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 240.109375, + "completions/mean_terminated_length": 240.109375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.21764399111270905, + "epoch": 1.5073529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07106719898330048, + "kl": 0.08413289487361908, + "learning_rate": 5.847186321963792e-07, + "loss": 0.0008, + "num_tokens": 38951813.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6368263959884644, + "sampling/importance_sampling_ratio/mean": 0.9997113943099976, + "sampling/importance_sampling_ratio/min": 0.5479965806007385, + "sampling/sampling_logp_difference/max": 0.6014862060546875, + "sampling/sampling_logp_difference/mean": 0.01225600577890873, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 216.34375, + "completions/mean_terminated_length": 216.34375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.18998324871063232, + "epoch": 1.508578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0487005525945726, + "kl": 0.0694509744644165, + "learning_rate": 5.840164674705542e-07, + "loss": 0.0006, + "num_tokens": 38984347.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.951185703277588, + "sampling/importance_sampling_ratio/mean": 0.9997248649597168, + "sampling/importance_sampling_ratio/min": 0.3915136456489563, + "sampling/sampling_logp_difference/max": 0.937734842300415, + "sampling/sampling_logp_difference/mean": 0.012988218106329441, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 219.5625, + "completions/mean_terminated_length": 219.5625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.26098933815956116, + "epoch": 1.5098039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10252765873181363, + "kl": 0.09128749370574951, + "learning_rate": 5.833141321966228e-07, + "loss": 0.0009, + "num_tokens": 39019167.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5953713655471802, + "sampling/importance_sampling_ratio/mean": 0.9997473359107971, + "sampling/importance_sampling_ratio/min": 0.13503998517990112, + "sampling/sampling_logp_difference/max": 2.0021843910217285, + "sampling/sampling_logp_difference/mean": 0.01455174945294857, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 194.484375, + "completions/mean_terminated_length": 194.484375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.18635328114032745, + "epoch": 1.5110294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048930907499501734, + "kl": 0.08086130768060684, + "learning_rate": 5.826116278002813e-07, + "loss": 0.0008, + "num_tokens": 39046158.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9421849250793457, + "sampling/importance_sampling_ratio/mean": 1.0004785060882568, + "sampling/importance_sampling_ratio/min": 0.587902307510376, + "sampling/sampling_logp_difference/max": 0.663813591003418, + "sampling/sampling_logp_difference/mean": 0.011267730966210365, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.32486265897750854, + "epoch": 1.5122549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4183870647964283, + "kl": 0.09388802945613861, + "learning_rate": 5.819089557075688e-07, + "loss": 0.0367, + "num_tokens": 39081578.0, + "reward": 0.625, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002061128616333, + "sampling/importance_sampling_ratio/min": 0.32993054389953613, + "sampling/sampling_logp_difference/max": 1.1088731288909912, + "sampling/sampling_logp_difference/mean": 0.016056686639785767, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 226.125, + "completions/mean_terminated_length": 226.125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.23978698253631592, + "epoch": 1.5134803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4485751012632275, + "kl": 0.15217433869838715, + "learning_rate": 5.812061173448654e-07, + "loss": -0.0047, + "num_tokens": 39116690.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.624997854232788, + "sampling/importance_sampling_ratio/mean": 1.0002484321594238, + "sampling/importance_sampling_ratio/min": 0.5305609107017517, + "sampling/sampling_logp_difference/max": 0.6338205337524414, + "sampling/sampling_logp_difference/mean": 0.013219879940152168, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 291.484375, + "completions/mean_terminated_length": 291.484375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.370257705450058, + "epoch": 1.5147058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2805301030228824, + "kl": 0.1061616837978363, + "learning_rate": 5.805031141388883e-07, + "loss": 0.0032, + "num_tokens": 39156513.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 0.9997502565383911, + "sampling/importance_sampling_ratio/min": 0.528445839881897, + "sampling/sampling_logp_difference/max": 0.637814998626709, + "sampling/sampling_logp_difference/mean": 0.01492888294160366, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 239.625, + "completions/mean_terminated_length": 239.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.21345660090446472, + "epoch": 1.5159313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02876361581332321, + "kl": 0.0695776715874672, + "learning_rate": 5.797999475166896e-07, + "loss": 0.0006, + "num_tokens": 39203513.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4442031383514404, + "sampling/importance_sampling_ratio/mean": 0.9994962215423584, + "sampling/importance_sampling_ratio/min": 0.42204251885414124, + "sampling/sampling_logp_difference/max": 0.8626492023468018, + "sampling/sampling_logp_difference/mean": 0.012082409113645554, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 207.71875, + "completions/mean_terminated_length": 207.71875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3437170386314392, + "epoch": 1.517156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0526115728837453, + "kl": 0.1577879637479782, + "learning_rate": 5.790966189056529e-07, + "loss": 0.0024, + "num_tokens": 39236583.0, + "reward": -0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.8864585161209106, + "sampling/importance_sampling_ratio/mean": 0.9998160004615784, + "sampling/importance_sampling_ratio/min": 0.6117802262306213, + "sampling/sampling_logp_difference/max": 0.6347012519836426, + "sampling/sampling_logp_difference/mean": 0.016931496560573578, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 232.25, + "completions/mean_terminated_length": 232.25, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.30623072385787964, + "epoch": 1.5183823529411766, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5214898330188613, + "kl": 0.15095746517181396, + "learning_rate": 5.783931297334907e-07, + "loss": 0.0384, + "num_tokens": 39273879.0, + "reward": 0.59375, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.974285364151001, + "sampling/importance_sampling_ratio/mean": 0.9998407959938049, + "sampling/importance_sampling_ratio/min": 0.5403859615325928, + "sampling/sampling_logp_difference/max": 0.6802065372467041, + "sampling/sampling_logp_difference/mean": 0.015204276889562607, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 257.484375, + "completions/mean_terminated_length": 257.484375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.26164013147354126, + "epoch": 1.5196078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042319742710021084, + "kl": 0.10378521680831909, + "learning_rate": 5.776894814282415e-07, + "loss": 0.0008, + "num_tokens": 39307622.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.574885606765747, + "sampling/importance_sampling_ratio/mean": 0.999725878238678, + "sampling/importance_sampling_ratio/min": 0.6167843341827393, + "sampling/sampling_logp_difference/max": 0.48323583602905273, + "sampling/sampling_logp_difference/mean": 0.013524348847568035, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 199.984375, + "completions/mean_terminated_length": 199.984375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.288371205329895, + "epoch": 1.5208333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1808160531731653, + "kl": 0.1048680916428566, + "learning_rate": 5.769856754182667e-07, + "loss": -0.0152, + "num_tokens": 39339237.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5963785648345947, + "sampling/importance_sampling_ratio/mean": 0.9998101592063904, + "sampling/importance_sampling_ratio/min": 0.629842221736908, + "sampling/sampling_logp_difference/max": 0.46773767471313477, + "sampling/sampling_logp_difference/mean": 0.01503780297935009, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 228.203125, + "completions/mean_terminated_length": 228.203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2760133743286133, + "epoch": 1.5220588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1542302053264695, + "kl": 0.08837348222732544, + "learning_rate": 5.762817131322481e-07, + "loss": -0.0054, + "num_tokens": 39370978.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9127037525177002, + "sampling/importance_sampling_ratio/mean": 0.9995803833007812, + "sampling/importance_sampling_ratio/min": 0.3032958507537842, + "sampling/sampling_logp_difference/max": 1.1930465698242188, + "sampling/sampling_logp_difference/mean": 0.01568417251110077, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 220.234375, + "completions/mean_terminated_length": 220.234375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.2781566083431244, + "epoch": 1.5232843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.149632577200551, + "kl": 0.1296968013048172, + "learning_rate": 5.755775959991844e-07, + "loss": 0.015, + "num_tokens": 39402817.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6944935321807861, + "sampling/importance_sampling_ratio/mean": 0.9998199939727783, + "sampling/importance_sampling_ratio/min": 0.04180217534303665, + "sampling/sampling_logp_difference/max": 3.174806833267212, + "sampling/sampling_logp_difference/mean": 0.013846222311258316, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 147.9375, + "completions/mean_terminated_length": 147.9375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.26521608233451843, + "epoch": 1.5245098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3353712233418034, + "kl": 0.12796512246131897, + "learning_rate": 5.74873325448389e-07, + "loss": -0.0034, + "num_tokens": 39427325.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6949468851089478, + "sampling/importance_sampling_ratio/mean": 0.9996969699859619, + "sampling/importance_sampling_ratio/min": 0.6048211455345154, + "sampling/sampling_logp_difference/max": 0.5276514291763306, + "sampling/sampling_logp_difference/mean": 0.016097765415906906, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 217.703125, + "completions/mean_terminated_length": 217.703125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.23921138048171997, + "epoch": 1.5257352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7403096642705569, + "kl": 0.09272611141204834, + "learning_rate": 5.741689029094861e-07, + "loss": -0.0081, + "num_tokens": 39457690.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.7083966732025146, + "sampling/importance_sampling_ratio/mean": 1.000200629234314, + "sampling/importance_sampling_ratio/min": 0.6069169044494629, + "sampling/sampling_logp_difference/max": 0.535555362701416, + "sampling/sampling_logp_difference/mean": 0.011944685131311417, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 199.59375, + "completions/mean_terminated_length": 199.59375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2644798159599304, + "epoch": 1.5269607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5442020790508673, + "kl": 0.10834425687789917, + "learning_rate": 5.73464329812409e-07, + "loss": 0.001, + "num_tokens": 39485184.0, + "reward": -0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6782118082046509, + "sampling/importance_sampling_ratio/mean": 0.9999837875366211, + "sampling/importance_sampling_ratio/min": 0.5354776978492737, + "sampling/sampling_logp_difference/max": 0.6245959997177124, + "sampling/sampling_logp_difference/mean": 0.01373962964862585, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 221.21875, + "completions/mean_terminated_length": 221.21875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.22620631754398346, + "epoch": 1.528186274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3353967451302322, + "kl": 0.09754102677106857, + "learning_rate": 5.727596075873965e-07, + "loss": 0.0354, + "num_tokens": 39514190.0, + "reward": 0.4375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6716618537902832, + "sampling/importance_sampling_ratio/mean": 0.9996615648269653, + "sampling/importance_sampling_ratio/min": 0.5687833428382874, + "sampling/sampling_logp_difference/max": 0.5642556548118591, + "sampling/sampling_logp_difference/mean": 0.011098390445113182, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 193.03125, + "completions/mean_terminated_length": 193.03125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2537536025047302, + "epoch": 1.5294117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.105612797903851, + "kl": 0.18153579533100128, + "learning_rate": 5.7205473766499e-07, + "loss": 0.0515, + "num_tokens": 39546144.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009520053863525, + "sampling/importance_sampling_ratio/min": 0.6239134669303894, + "sampling/sampling_logp_difference/max": 0.807680606842041, + "sampling/sampling_logp_difference/mean": 0.014567977748811245, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 169.390625, + "completions/mean_terminated_length": 169.390625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.23550954461097717, + "epoch": 1.530637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.09223664492514, + "kl": 0.1612507551908493, + "learning_rate": 5.71349721476031e-07, + "loss": -0.0081, + "num_tokens": 39577081.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5416961908340454, + "sampling/importance_sampling_ratio/mean": 0.9995707869529724, + "sampling/importance_sampling_ratio/min": 0.6337987780570984, + "sampling/sampling_logp_difference/max": 0.45602381229400635, + "sampling/sampling_logp_difference/mean": 0.0124875633046031, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 237.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.35919585824012756, + "epoch": 1.531862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2726428466658852, + "kl": 0.13468556106090546, + "learning_rate": 5.706445604516574e-07, + "loss": 0.0094, + "num_tokens": 39620777.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.846413254737854, + "sampling/importance_sampling_ratio/mean": 0.9995013475418091, + "sampling/importance_sampling_ratio/min": 0.36016330122947693, + "sampling/sampling_logp_difference/max": 1.02119779586792, + "sampling/sampling_logp_difference/mean": 0.018313605338335037, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 236.328125, + "completions/mean_terminated_length": 236.328125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2343226969242096, + "epoch": 1.5330882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3112108523889763, + "kl": 0.09341298043727875, + "learning_rate": 5.699392560233017e-07, + "loss": 0.0829, + "num_tokens": 39653214.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4505730867385864, + "sampling/importance_sampling_ratio/mean": 0.9998184442520142, + "sampling/importance_sampling_ratio/min": 0.4843129515647888, + "sampling/sampling_logp_difference/max": 0.7250239849090576, + "sampling/sampling_logp_difference/mean": 0.013272561132907867, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.23711419105529785, + "epoch": 1.534313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.111938035878301, + "kl": 0.10805274546146393, + "learning_rate": 5.69233809622687e-07, + "loss": 0.0112, + "num_tokens": 39685150.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5494507551193237, + "sampling/importance_sampling_ratio/mean": 0.9999529123306274, + "sampling/importance_sampling_ratio/min": 0.5428563356399536, + "sampling/sampling_logp_difference/max": 0.6109106540679932, + "sampling/sampling_logp_difference/mean": 0.012215834110975266, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 202.34375, + "completions/mean_terminated_length": 202.34375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2381390482187271, + "epoch": 1.5355392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6833653141312037, + "kl": 0.0950554683804512, + "learning_rate": 5.685282226818249e-07, + "loss": 0.0177, + "num_tokens": 39719092.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999245405197144, + "sampling/importance_sampling_ratio/min": 0.5272694826126099, + "sampling/sampling_logp_difference/max": 1.5067219734191895, + "sampling/sampling_logp_difference/mean": 0.013562340289354324, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 208.296875, + "completions/mean_terminated_length": 208.296875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.34556320309638977, + "epoch": 1.5367647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7188936050742745, + "kl": 0.12145892530679703, + "learning_rate": 5.678224966330119e-07, + "loss": 0.0149, + "num_tokens": 39753495.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6007500886917114, + "sampling/importance_sampling_ratio/mean": 0.9999268054962158, + "sampling/importance_sampling_ratio/min": 0.5603798031806946, + "sampling/sampling_logp_difference/max": 0.5791404247283936, + "sampling/sampling_logp_difference/mean": 0.016672348603606224, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 177.609375, + "completions/mean_terminated_length": 177.609375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.23078593611717224, + "epoch": 1.5379901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06222337763973542, + "kl": 0.08189937472343445, + "learning_rate": 5.671166329088277e-07, + "loss": 0.0008, + "num_tokens": 39780702.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6662049293518066, + "sampling/importance_sampling_ratio/mean": 1.0002987384796143, + "sampling/importance_sampling_ratio/min": 0.47876015305519104, + "sampling/sampling_logp_difference/max": 0.7365555167198181, + "sampling/sampling_logp_difference/mean": 0.013800526969134808, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 243.78125, + "completions/mean_terminated_length": 243.78125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2520521283149719, + "epoch": 1.5392156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2861153798710994, + "kl": 0.06537686288356781, + "learning_rate": 5.664106329421305e-07, + "loss": -0.0495, + "num_tokens": 39814560.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.646364450454712, + "sampling/importance_sampling_ratio/mean": 1.0000898838043213, + "sampling/importance_sampling_ratio/min": 0.5492440462112427, + "sampling/sampling_logp_difference/max": 0.5992124080657959, + "sampling/sampling_logp_difference/mean": 0.012905338779091835, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 215.3125, + "completions/mean_terminated_length": 215.3125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2798084616661072, + "epoch": 1.5404411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062002073670821556, + "kl": 0.09289909899234772, + "learning_rate": 5.657044981660559e-07, + "loss": 0.0009, + "num_tokens": 39848468.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5731983184814453, + "sampling/importance_sampling_ratio/mean": 0.9994695782661438, + "sampling/importance_sampling_ratio/min": 0.5772298574447632, + "sampling/sampling_logp_difference/max": 0.5495147705078125, + "sampling/sampling_logp_difference/mean": 0.014737410470843315, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 167.984375, + "completions/mean_terminated_length": 167.984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.23264190554618835, + "epoch": 1.5416666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10382213039474375, + "kl": 0.10797363519668579, + "learning_rate": 5.649982300140123e-07, + "loss": 0.0011, + "num_tokens": 39876115.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8272643089294434, + "sampling/importance_sampling_ratio/mean": 0.999854564666748, + "sampling/importance_sampling_ratio/min": 0.585029125213623, + "sampling/sampling_logp_difference/max": 0.6028199195861816, + "sampling/sampling_logp_difference/mean": 0.013319024816155434, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 173.421875, + "completions/mean_terminated_length": 173.421875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.23104506731033325, + "epoch": 1.5428921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.384033702678028, + "kl": 0.08499298989772797, + "learning_rate": 5.642918299196796e-07, + "loss": -0.0176, + "num_tokens": 39902078.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6162153482437134, + "sampling/importance_sampling_ratio/mean": 1.0007696151733398, + "sampling/importance_sampling_ratio/min": 0.621555745601654, + "sampling/sampling_logp_difference/max": 0.4800872802734375, + "sampling/sampling_logp_difference/mean": 0.012712078168988228, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 146.125, + "completions/mean_terminated_length": 146.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.20366084575653076, + "epoch": 1.5441176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6610212922375305, + "kl": 0.11102088540792465, + "learning_rate": 5.635852993170052e-07, + "loss": 0.0038, + "num_tokens": 39925158.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.630755066871643, + "sampling/importance_sampling_ratio/mean": 1.000718116760254, + "sampling/importance_sampling_ratio/min": 0.47591328620910645, + "sampling/sampling_logp_difference/max": 0.7425196170806885, + "sampling/sampling_logp_difference/mean": 0.013576138764619827, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 240.703125, + "completions/mean_terminated_length": 240.703125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2457953691482544, + "epoch": 1.545343137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5482759188590858, + "kl": 0.07239291071891785, + "learning_rate": 5.628786396402013e-07, + "loss": -0.017, + "num_tokens": 39959203.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 1.000040888786316, + "sampling/importance_sampling_ratio/min": 0.4159519374370575, + "sampling/sampling_logp_difference/max": 0.877185583114624, + "sampling/sampling_logp_difference/mean": 0.013064755126833916, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 129.765625, + "completions/mean_terminated_length": 129.765625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.19108673930168152, + "epoch": 1.5465686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09128008485690275, + "kl": 0.0935688316822052, + "learning_rate": 5.621718523237426e-07, + "loss": 0.0009, + "num_tokens": 39983556.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002806186676025, + "sampling/importance_sampling_ratio/min": 0.6650000214576721, + "sampling/sampling_logp_difference/max": 0.771212100982666, + "sampling/sampling_logp_difference/mean": 0.011321095749735832, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 215.4375, + "completions/mean_terminated_length": 215.4375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.23867498338222504, + "epoch": 1.5477941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9279048547268585, + "kl": 0.09833469241857529, + "learning_rate": 5.614649388023622e-07, + "loss": -0.0002, + "num_tokens": 40014288.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6091666221618652, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.549793004989624, + "sampling/sampling_logp_difference/max": 0.5982134342193604, + "sampling/sampling_logp_difference/mean": 0.01338406465947628, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 193.109375, + "completions/mean_terminated_length": 193.109375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.21172797679901123, + "epoch": 1.5490196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3127184525500883, + "kl": 0.06100928783416748, + "learning_rate": 5.607579005110502e-07, + "loss": 0.0006, + "num_tokens": 40041975.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9005506038665771, + "sampling/importance_sampling_ratio/mean": 1.0003749132156372, + "sampling/importance_sampling_ratio/min": 0.29519444704055786, + "sampling/sampling_logp_difference/max": 1.220120906829834, + "sampling/sampling_logp_difference/mean": 0.012064306065440178, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 169.578125, + "completions/mean_terminated_length": 169.578125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2743019461631775, + "epoch": 1.5502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4961160389061916, + "kl": 0.10028813779354095, + "learning_rate": 5.60050738885049e-07, + "loss": 0.0185, + "num_tokens": 40070188.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997214674949646, + "sampling/importance_sampling_ratio/min": 0.4946571886539459, + "sampling/sampling_logp_difference/max": 0.7074887752532959, + "sampling/sampling_logp_difference/mean": 0.01720958948135376, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 182.734375, + "completions/mean_terminated_length": 182.734375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3171706795692444, + "epoch": 1.5514705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4790588340111015, + "kl": 0.14416320621967316, + "learning_rate": 5.593434553598525e-07, + "loss": -0.0161, + "num_tokens": 40101547.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5588680505752563, + "sampling/importance_sampling_ratio/mean": 0.9998143911361694, + "sampling/importance_sampling_ratio/min": 0.4989607632160187, + "sampling/sampling_logp_difference/max": 0.695227861404419, + "sampling/sampling_logp_difference/mean": 0.017059538513422012, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 203.75, + "completions/mean_terminated_length": 203.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2670314908027649, + "epoch": 1.5526960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3274914775166644, + "kl": 0.08601544052362442, + "learning_rate": 5.586360513712009e-07, + "loss": 0.0017, + "num_tokens": 40129915.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000174641609192, + "sampling/importance_sampling_ratio/min": 0.5666781067848206, + "sampling/sampling_logp_difference/max": 0.7321939468383789, + "sampling/sampling_logp_difference/mean": 0.014580268412828445, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 218.40625, + "completions/mean_terminated_length": 218.40625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.28639858961105347, + "epoch": 1.553921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0743242514633502, + "kl": 0.0894109308719635, + "learning_rate": 5.579285283550797e-07, + "loss": -0.0179, + "num_tokens": 40163701.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.9880058765411377, + "sampling/importance_sampling_ratio/mean": 1.0004611015319824, + "sampling/importance_sampling_ratio/min": 0.5505126714706421, + "sampling/sampling_logp_difference/max": 0.6871321201324463, + "sampling/sampling_logp_difference/mean": 0.01633625663816929, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 163.984375, + "completions/mean_terminated_length": 163.984375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.23159533739089966, + "epoch": 1.5551470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3265368422339745, + "kl": 0.1146799698472023, + "learning_rate": 5.572208877477159e-07, + "loss": -0.0101, + "num_tokens": 40193972.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4586268663406372, + "sampling/importance_sampling_ratio/mean": 0.9998776912689209, + "sampling/importance_sampling_ratio/min": 0.6181763410568237, + "sampling/sampling_logp_difference/max": 0.48098158836364746, + "sampling/sampling_logp_difference/mean": 0.013385292142629623, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 128.265625, + "completions/mean_terminated_length": 128.265625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.2298588752746582, + "epoch": 1.5563725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1456240778612018, + "kl": 0.11662641167640686, + "learning_rate": 5.565131309855752e-07, + "loss": 0.0011, + "num_tokens": 40220181.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.695549726486206, + "sampling/importance_sampling_ratio/mean": 0.9998950958251953, + "sampling/importance_sampling_ratio/min": 0.3276721239089966, + "sampling/sampling_logp_difference/max": 1.1157417297363281, + "sampling/sampling_logp_difference/mean": 0.015842556953430176, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 213.21875, + "completions/mean_terminated_length": 213.21875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.26580965518951416, + "epoch": 1.5575980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9841841882140552, + "kl": 0.10150258243083954, + "learning_rate": 5.558052595053586e-07, + "loss": -0.0212, + "num_tokens": 40257843.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993384480476379, + "sampling/importance_sampling_ratio/min": 0.23381738364696503, + "sampling/sampling_logp_difference/max": 1.4532148838043213, + "sampling/sampling_logp_difference/mean": 0.01650913991034031, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 211.765625, + "completions/mean_terminated_length": 211.765625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.22360765933990479, + "epoch": 1.5588235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09318137774395309, + "kl": 0.07635696977376938, + "learning_rate": 5.550972747440005e-07, + "loss": 0.0007, + "num_tokens": 40285636.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8329914808273315, + "sampling/importance_sampling_ratio/mean": 1.0001411437988281, + "sampling/importance_sampling_ratio/min": 0.568603515625, + "sampling/sampling_logp_difference/max": 0.6059494018554688, + "sampling/sampling_logp_difference/mean": 0.012909941375255585, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 238.90625, + "completions/mean_terminated_length": 238.90625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3557601571083069, + "epoch": 1.5600490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.56944518339622, + "kl": 0.11926597356796265, + "learning_rate": 5.543891781386655e-07, + "loss": -0.0501, + "num_tokens": 40326462.0, + "reward": 0.625, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.8657264709472656, + "sampling/importance_sampling_ratio/mean": 1.0002565383911133, + "sampling/importance_sampling_ratio/min": 0.46485185623168945, + "sampling/sampling_logp_difference/max": 0.7660365104675293, + "sampling/sampling_logp_difference/mean": 0.016536138951778412, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 192.703125, + "completions/mean_terminated_length": 192.703125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.21355998516082764, + "epoch": 1.5612745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.042234641612648, + "kl": 0.08831916004419327, + "learning_rate": 5.536809711267443e-07, + "loss": -0.011, + "num_tokens": 40354635.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8180792331695557, + "sampling/importance_sampling_ratio/mean": 1.0000646114349365, + "sampling/importance_sampling_ratio/min": 0.5598354339599609, + "sampling/sampling_logp_difference/max": 0.5977805852890015, + "sampling/sampling_logp_difference/mean": 0.01221003569662571, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 194.125, + "completions/mean_terminated_length": 194.125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.2758662700653076, + "epoch": 1.5625, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4188679796807888, + "kl": 0.10724423825740814, + "learning_rate": 5.529726551458526e-07, + "loss": 0.01, + "num_tokens": 40387651.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8446329832077026, + "sampling/importance_sampling_ratio/mean": 0.9991621375083923, + "sampling/importance_sampling_ratio/min": 0.4571169912815094, + "sampling/sampling_logp_difference/max": 0.7828159332275391, + "sampling/sampling_logp_difference/mean": 0.016088467091321945, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 223.421875, + "completions/mean_terminated_length": 223.421875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2723897695541382, + "epoch": 1.5637254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1141962541942754, + "kl": 0.08754530549049377, + "learning_rate": 5.522642316338268e-07, + "loss": -0.0267, + "num_tokens": 40419886.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003609657287598, + "sampling/importance_sampling_ratio/min": 0.32915252447128296, + "sampling/sampling_logp_difference/max": 1.1112340688705444, + "sampling/sampling_logp_difference/mean": 0.01671537756919861, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 158.203125, + "completions/mean_terminated_length": 158.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2390233278274536, + "epoch": 1.5649509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0850297375053766, + "kl": 0.11868917942047119, + "learning_rate": 5.515557020287218e-07, + "loss": 0.0012, + "num_tokens": 40445563.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001310110092163, + "sampling/importance_sampling_ratio/min": 0.619220495223999, + "sampling/sampling_logp_difference/max": 0.8923323154449463, + "sampling/sampling_logp_difference/mean": 0.014304285869002342, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 217.515625, + "completions/mean_terminated_length": 217.515625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.2527675926685333, + "epoch": 1.5661764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.002817954155109, + "kl": 0.10649637132883072, + "learning_rate": 5.508470677688078e-07, + "loss": -0.0023, + "num_tokens": 40478076.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.821816086769104, + "sampling/importance_sampling_ratio/mean": 1.0001246929168701, + "sampling/importance_sampling_ratio/min": 0.6048331260681152, + "sampling/sampling_logp_difference/max": 0.5998338460922241, + "sampling/sampling_logp_difference/mean": 0.014033079147338867, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 191.71875, + "completions/mean_terminated_length": 191.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3498360812664032, + "epoch": 1.5674019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.817895853693584, + "kl": 0.14529192447662354, + "learning_rate": 5.501383302925677e-07, + "loss": 0.0287, + "num_tokens": 40513050.0, + "reward": 0.375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8661153316497803, + "sampling/importance_sampling_ratio/mean": 1.000024437904358, + "sampling/importance_sampling_ratio/min": 0.6282871961593628, + "sampling/sampling_logp_difference/max": 0.6238589286804199, + "sampling/sampling_logp_difference/mean": 0.01771906018257141, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 237.015625, + "completions/mean_terminated_length": 237.015625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.29353150725364685, + "epoch": 1.5686274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22491306039810763, + "kl": 0.09252274036407471, + "learning_rate": 5.494294910386933e-07, + "loss": 0.001, + "num_tokens": 40551275.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5715757608413696, + "sampling/importance_sampling_ratio/mean": 1.0004040002822876, + "sampling/importance_sampling_ratio/min": 0.4678581655025482, + "sampling/sampling_logp_difference/max": 0.7595901489257812, + "sampling/sampling_logp_difference/mean": 0.01582920365035534, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 129.234375, + "completions/mean_terminated_length": 129.234375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.23700666427612305, + "epoch": 1.5698529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9577397856869958, + "kl": 0.12990131974220276, + "learning_rate": 5.487205514460835e-07, + "loss": -0.0058, + "num_tokens": 40576010.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6820436716079712, + "sampling/importance_sampling_ratio/mean": 1.0001105070114136, + "sampling/importance_sampling_ratio/min": 0.5166517496109009, + "sampling/sampling_logp_difference/max": 0.6603862643241882, + "sampling/sampling_logp_difference/mean": 0.015840716660022736, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 163.578125, + "completions/mean_terminated_length": 163.578125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.20090463757514954, + "epoch": 1.571078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05936568691447249, + "kl": 0.07548060268163681, + "learning_rate": 5.480115129538409e-07, + "loss": 0.0008, + "num_tokens": 40605375.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.598211407661438, + "sampling/importance_sampling_ratio/mean": 0.9995618462562561, + "sampling/importance_sampling_ratio/min": 0.4885278046131134, + "sampling/sampling_logp_difference/max": 0.7163589000701904, + "sampling/sampling_logp_difference/mean": 0.012459134683012962, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 168.65625, + "completions/mean_terminated_length": 168.65625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.23894764482975006, + "epoch": 1.5723039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4373800788773192, + "kl": 0.07795411348342896, + "learning_rate": 5.473023770012686e-07, + "loss": -0.0108, + "num_tokens": 40632025.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.959810495376587, + "sampling/importance_sampling_ratio/mean": 1.000944972038269, + "sampling/importance_sampling_ratio/min": 0.4056454598903656, + "sampling/sampling_logp_difference/max": 0.902275800704956, + "sampling/sampling_logp_difference/mean": 0.01591445505619049, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 201.109375, + "completions/mean_terminated_length": 201.109375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2994091510772705, + "epoch": 1.5735294117647058, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.485785633874617, + "kl": 0.12637823820114136, + "learning_rate": 5.465931450278676e-07, + "loss": -0.0054, + "num_tokens": 40664384.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6253334283828735, + "sampling/importance_sampling_ratio/mean": 0.9995725154876709, + "sampling/importance_sampling_ratio/min": 0.5292443037033081, + "sampling/sampling_logp_difference/max": 0.6363050937652588, + "sampling/sampling_logp_difference/mean": 0.015157992951571941, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 264.28125, + "completions/mean_terminated_length": 264.28125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.3000125586986542, + "epoch": 1.5747549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1394094039805573, + "kl": 0.12689156830310822, + "learning_rate": 5.458838184733341e-07, + "loss": 0.0098, + "num_tokens": 40697474.0, + "reward": -0.40625, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.8945499658584595, + "sampling/importance_sampling_ratio/mean": 1.0002498626708984, + "sampling/importance_sampling_ratio/min": 0.5997557044029236, + "sampling/sampling_logp_difference/max": 0.6389813423156738, + "sampling/sampling_logp_difference/mean": 0.015428531914949417, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 187.046875, + "completions/mean_terminated_length": 187.046875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.22485093772411346, + "epoch": 1.5759803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6563343521050014, + "kl": 0.06725899875164032, + "learning_rate": 5.451743987775559e-07, + "loss": 0.0147, + "num_tokens": 40728741.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999529719352722, + "sampling/importance_sampling_ratio/min": 0.4202858805656433, + "sampling/sampling_logp_difference/max": 0.8668200969696045, + "sampling/sampling_logp_difference/mean": 0.014246209524571896, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 223.328125, + "completions/mean_terminated_length": 223.328125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2730158567428589, + "epoch": 1.5772058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0062192756669384, + "kl": 0.10583344101905823, + "learning_rate": 5.444648873806101e-07, + "loss": -0.0233, + "num_tokens": 40758074.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.406072735786438, + "sampling/importance_sampling_ratio/mean": 0.9998959302902222, + "sampling/importance_sampling_ratio/min": 0.46186599135398865, + "sampling/sampling_logp_difference/max": 0.7724804878234863, + "sampling/sampling_logp_difference/mean": 0.014256740920245647, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 187.921875, + "completions/mean_terminated_length": 187.921875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.28058645129203796, + "epoch": 1.5784313725490198, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9967848781737365, + "kl": 0.09992186725139618, + "learning_rate": 5.437552857227597e-07, + "loss": 0.0307, + "num_tokens": 40786661.0, + "reward": 0.59375, + "reward_std": 0.5457825064659119, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4743789434432983, + "sampling/importance_sampling_ratio/mean": 1.0000240802764893, + "sampling/importance_sampling_ratio/min": 0.5766149759292603, + "sampling/sampling_logp_difference/max": 0.5505805015563965, + "sampling/sampling_logp_difference/mean": 0.013886423781514168, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 246.75, + "completions/mean_terminated_length": 246.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3286219537258148, + "epoch": 1.579656862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4992127432975813, + "kl": 0.11583884805440903, + "learning_rate": 5.430455952444512e-07, + "loss": 0.046, + "num_tokens": 40814549.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6112518310546875, + "sampling/importance_sampling_ratio/mean": 0.9998191595077515, + "sampling/importance_sampling_ratio/min": 0.5836060047149658, + "sampling/sampling_logp_difference/max": 0.5385291576385498, + "sampling/sampling_logp_difference/mean": 0.016217375174164772, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 257.609375, + "completions/mean_terminated_length": 257.609375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.28263741731643677, + "epoch": 1.5808823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4290550984192945, + "kl": 0.10155349224805832, + "learning_rate": 5.423358173863116e-07, + "loss": 0.0061, + "num_tokens": 40848812.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7165179252624512, + "sampling/importance_sampling_ratio/mean": 0.9999136328697205, + "sampling/importance_sampling_ratio/min": 0.5624218583106995, + "sampling/sampling_logp_difference/max": 0.5755031108856201, + "sampling/sampling_logp_difference/mean": 0.014271766878664494, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 213.53125, + "completions/mean_terminated_length": 213.53125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.24739715456962585, + "epoch": 1.5821078431372548, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6734622190020279, + "kl": 0.09649119526147842, + "learning_rate": 5.416259535891446e-07, + "loss": 0.04, + "num_tokens": 40879598.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995496869087219, + "sampling/importance_sampling_ratio/min": 0.3670720160007477, + "sampling/sampling_logp_difference/max": 1.0069791078567505, + "sampling/sampling_logp_difference/mean": 0.013408919796347618, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 215.5625, + "completions/mean_terminated_length": 215.5625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.294612318277359, + "epoch": 1.5833333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8663318814916061, + "kl": 0.1407470703125, + "learning_rate": 5.409160052939291e-07, + "loss": -0.0069, + "num_tokens": 40915298.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 0.38391175866127014, + "sampling/sampling_logp_difference/max": 0.9573426246643066, + "sampling/sampling_logp_difference/mean": 0.01669587939977646, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 171.25, + "completions/mean_terminated_length": 171.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.25864478945732117, + "epoch": 1.5845588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6736988394049386, + "kl": 0.08632921427488327, + "learning_rate": 5.402059739418148e-07, + "loss": 0.1311, + "num_tokens": 40942978.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006288290023804, + "sampling/importance_sampling_ratio/min": 0.6288881301879883, + "sampling/sampling_logp_difference/max": 0.7209138870239258, + "sampling/sampling_logp_difference/mean": 0.014864898286759853, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 260.15625, + "completions/mean_terminated_length": 260.15625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.25275641679763794, + "epoch": 1.5857843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2194443009765528, + "kl": 0.0801507979631424, + "learning_rate": 5.394958609741206e-07, + "loss": -0.0058, + "num_tokens": 40975436.0, + "reward": -0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997862577438354, + "sampling/importance_sampling_ratio/min": 0.6255087852478027, + "sampling/sampling_logp_difference/max": 0.7695990800857544, + "sampling/sampling_logp_difference/mean": 0.012196059338748455, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 250.15625, + "completions/mean_terminated_length": 250.15625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3004721701145172, + "epoch": 1.5870098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2984374525467763, + "kl": 0.1429024338722229, + "learning_rate": 5.387856678323307e-07, + "loss": 0.0101, + "num_tokens": 41009478.0, + "reward": 0.34375, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5772258043289185, + "sampling/importance_sampling_ratio/mean": 1.000220537185669, + "sampling/importance_sampling_ratio/min": 0.559878408908844, + "sampling/sampling_logp_difference/max": 0.5800356268882751, + "sampling/sampling_logp_difference/mean": 0.015601429156959057, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 167.65625, + "completions/mean_terminated_length": 167.65625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.1998165249824524, + "epoch": 1.5882352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06245472818406181, + "kl": 0.0781020000576973, + "learning_rate": 5.380753959580922e-07, + "loss": 0.0008, + "num_tokens": 41049824.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9509587287902832, + "sampling/importance_sampling_ratio/mean": 1.0004262924194336, + "sampling/importance_sampling_ratio/min": 0.5226418375968933, + "sampling/sampling_logp_difference/max": 0.668320894241333, + "sampling/sampling_logp_difference/mean": 0.012588824145495892, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 244.046875, + "completions/mean_terminated_length": 244.046875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3870052993297577, + "epoch": 1.5894607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1356174564170538, + "kl": 0.1631091684103012, + "learning_rate": 5.373650467932121e-07, + "loss": -0.0164, + "num_tokens": 41083331.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6629855632781982, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 0.6024278998374939, + "sampling/sampling_logp_difference/max": 0.5086145401000977, + "sampling/sampling_logp_difference/mean": 0.017559468746185303, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 178.03125, + "completions/mean_terminated_length": 178.03125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24488691985607147, + "epoch": 1.590686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05664842750606466, + "kl": 0.07084640115499496, + "learning_rate": 5.366546217796541e-07, + "loss": 0.0007, + "num_tokens": 41115637.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9388110637664795, + "sampling/importance_sampling_ratio/mean": 1.000507116317749, + "sampling/importance_sampling_ratio/min": 0.5753761529922485, + "sampling/sampling_logp_difference/max": 0.6620749235153198, + "sampling/sampling_logp_difference/mean": 0.014486636035144329, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 258.1875, + "completions/mean_terminated_length": 258.1875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.3009330928325653, + "epoch": 1.5919117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6563035468953373, + "kl": 0.0957925021648407, + "learning_rate": 5.359441223595363e-07, + "loss": -0.0652, + "num_tokens": 41152337.0, + "reward": 0.5, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6631168127059937, + "sampling/importance_sampling_ratio/mean": 0.9993993043899536, + "sampling/importance_sampling_ratio/min": 0.5552826523780823, + "sampling/sampling_logp_difference/max": 0.5882779359817505, + "sampling/sampling_logp_difference/mean": 0.013483626767992973, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 219.859375, + "completions/mean_terminated_length": 219.859375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.28012022376060486, + "epoch": 1.593137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5183671380069574, + "kl": 0.13128457963466644, + "learning_rate": 5.352335499751269e-07, + "loss": -0.0276, + "num_tokens": 41182680.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.9445205926895142, + "sampling/importance_sampling_ratio/mean": 1.0000536441802979, + "sampling/importance_sampling_ratio/min": 0.5100371837615967, + "sampling/sampling_logp_difference/max": 0.673271656036377, + "sampling/sampling_logp_difference/mean": 0.01482788659632206, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 252.359375, + "completions/mean_terminated_length": 252.359375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.2718067765235901, + "epoch": 1.594362745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5306560736696353, + "kl": 0.0820138156414032, + "learning_rate": 5.345229060688433e-07, + "loss": -0.0155, + "num_tokens": 41218367.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6451834440231323, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.5406036972999573, + "sampling/sampling_logp_difference/max": 0.6150689125061035, + "sampling/sampling_logp_difference/mean": 0.014146863482892513, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 177.34375, + "completions/mean_terminated_length": 177.34375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.22269929945468903, + "epoch": 1.5955882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3923351940283724, + "kl": 0.07856234908103943, + "learning_rate": 5.338121920832475e-07, + "loss": 0.0008, + "num_tokens": 41247365.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.890487790107727, + "sampling/importance_sampling_ratio/mean": 0.9993278980255127, + "sampling/importance_sampling_ratio/min": 0.5250731706619263, + "sampling/sampling_logp_difference/max": 0.6442176103591919, + "sampling/sampling_logp_difference/mean": 0.014698393642902374, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 193.828125, + "completions/mean_terminated_length": 193.828125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3115875720977783, + "epoch": 1.596813725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7693350321536974, + "kl": 0.1558687388896942, + "learning_rate": 5.331014094610438e-07, + "loss": 0.0104, + "num_tokens": 41275018.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6519043445587158, + "sampling/importance_sampling_ratio/mean": 0.9997758865356445, + "sampling/importance_sampling_ratio/min": 0.6171634197235107, + "sampling/sampling_logp_difference/max": 0.5019288063049316, + "sampling/sampling_logp_difference/mean": 0.016039744019508362, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 184.890625, + "completions/mean_terminated_length": 184.890625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.23528572916984558, + "epoch": 1.5980392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2081387886376294, + "kl": 0.0981862023472786, + "learning_rate": 5.323905596450759e-07, + "loss": 0.0365, + "num_tokens": 41304643.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004469156265259, + "sampling/importance_sampling_ratio/min": 0.4046587347984314, + "sampling/sampling_logp_difference/max": 1.098386287689209, + "sampling/sampling_logp_difference/mean": 0.01493294071406126, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 212.859375, + "completions/mean_terminated_length": 212.859375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.27819180488586426, + "epoch": 1.5992647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1589790752226412, + "kl": 0.10933251678943634, + "learning_rate": 5.31679644078324e-07, + "loss": -0.0048, + "num_tokens": 41333914.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6308337450027466, + "sampling/importance_sampling_ratio/mean": 0.99986732006073, + "sampling/importance_sampling_ratio/min": 0.5278540849685669, + "sampling/sampling_logp_difference/max": 0.6389353275299072, + "sampling/sampling_logp_difference/mean": 0.014791795052587986, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 180.953125, + "completions/mean_terminated_length": 180.953125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.19707581400871277, + "epoch": 1.6004901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07642399662364885, + "kl": 0.09464478492736816, + "learning_rate": 5.309686642039015e-07, + "loss": 0.0009, + "num_tokens": 41361063.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.62743079662323, + "sampling/importance_sampling_ratio/mean": 1.0002444982528687, + "sampling/importance_sampling_ratio/min": 0.41710734367370605, + "sampling/sampling_logp_difference/max": 0.8744117021560669, + "sampling/sampling_logp_difference/mean": 0.013895422220230103, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 202.453125, + "completions/mean_terminated_length": 202.453125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.20806419849395752, + "epoch": 1.6017156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04285113876154974, + "kl": 0.07179662585258484, + "learning_rate": 5.302576214650527e-07, + "loss": 0.0006, + "num_tokens": 41393028.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8008705377578735, + "sampling/importance_sampling_ratio/mean": 1.000395655632019, + "sampling/importance_sampling_ratio/min": 0.49519240856170654, + "sampling/sampling_logp_difference/max": 0.7028088569641113, + "sampling/sampling_logp_difference/mean": 0.01474726665765047, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 194.96875, + "completions/mean_terminated_length": 194.96875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.1974889189004898, + "epoch": 1.6029411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4297146563298104, + "kl": 0.08590207993984222, + "learning_rate": 5.295465173051491e-07, + "loss": 0.077, + "num_tokens": 41423906.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.755322813987732, + "sampling/importance_sampling_ratio/mean": 0.9999452233314514, + "sampling/importance_sampling_ratio/min": 0.6205163598060608, + "sampling/sampling_logp_difference/max": 0.5626528263092041, + "sampling/sampling_logp_difference/mean": 0.012646855786442757, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 228.6875, + "completions/mean_terminated_length": 228.6875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.31509193778038025, + "epoch": 1.6041666666666665, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8576238928473223, + "kl": 0.12904363870620728, + "learning_rate": 5.288353531676873e-07, + "loss": -0.0124, + "num_tokens": 41454750.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5882036685943604, + "sampling/importance_sampling_ratio/mean": 0.9996026754379272, + "sampling/importance_sampling_ratio/min": 0.5271825194358826, + "sampling/sampling_logp_difference/max": 0.6402084827423096, + "sampling/sampling_logp_difference/mean": 0.015689942985773087, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 140.96875, + "completions/mean_terminated_length": 140.96875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.1562458574771881, + "epoch": 1.6053921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09737824294354848, + "kl": 0.09509014338254929, + "learning_rate": 5.281241304962852e-07, + "loss": 0.0009, + "num_tokens": 41480060.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5647554397583008, + "sampling/importance_sampling_ratio/mean": 0.999330461025238, + "sampling/importance_sampling_ratio/min": 0.44270896911621094, + "sampling/sampling_logp_difference/max": 0.814842700958252, + "sampling/sampling_logp_difference/mean": 0.01225283369421959, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 164.140625, + "completions/mean_terminated_length": 164.140625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.17961689829826355, + "epoch": 1.6066176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12433184014965502, + "kl": 0.12926530838012695, + "learning_rate": 5.2741285073468e-07, + "loss": 0.0012, + "num_tokens": 41516757.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4351998567581177, + "sampling/importance_sampling_ratio/mean": 0.9998909831047058, + "sampling/importance_sampling_ratio/min": 0.4443853497505188, + "sampling/sampling_logp_difference/max": 0.8110632300376892, + "sampling/sampling_logp_difference/mean": 0.011371592059731483, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 190.703125, + "completions/mean_terminated_length": 190.703125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.2711417078971863, + "epoch": 1.607843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.7073808572881557, + "kl": 0.15485966205596924, + "learning_rate": 5.267015153267245e-07, + "loss": 0.0367, + "num_tokens": 41548786.0, + "reward": 0.59375, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005667209625244, + "sampling/importance_sampling_ratio/min": 0.4746485650539398, + "sampling/sampling_logp_difference/max": 0.8474371433258057, + "sampling/sampling_logp_difference/mean": 0.016850367188453674, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 183.28125, + "completions/mean_terminated_length": 183.28125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.25501537322998047, + "epoch": 1.6090686274509802, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.093866343919497, + "kl": 0.11585363000631332, + "learning_rate": 5.259901257163844e-07, + "loss": 0.0053, + "num_tokens": 41577060.0, + "reward": 0.6875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6257047653198242, + "sampling/importance_sampling_ratio/mean": 0.9992328882217407, + "sampling/importance_sampling_ratio/min": 0.4216938018798828, + "sampling/sampling_logp_difference/max": 0.8634757995605469, + "sampling/sampling_logp_difference/mean": 0.01534545049071312, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 133.21875, + "completions/mean_terminated_length": 133.21875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2237296998500824, + "epoch": 1.6102941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055933878857765205, + "kl": 0.07522813230752945, + "learning_rate": 5.252786833477358e-07, + "loss": 0.0007, + "num_tokens": 41605218.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.558457612991333, + "sampling/importance_sampling_ratio/mean": 0.9998308420181274, + "sampling/importance_sampling_ratio/min": 0.48925769329071045, + "sampling/sampling_logp_difference/max": 0.7148659229278564, + "sampling/sampling_logp_difference/mean": 0.016113966703414917, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 259.734375, + "completions/mean_terminated_length": 259.734375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.24126943945884705, + "epoch": 1.6115196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9630807576324676, + "kl": 0.08669568598270416, + "learning_rate": 5.245671896649612e-07, + "loss": 0.0099, + "num_tokens": 41640865.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997607469558716, + "sampling/importance_sampling_ratio/min": 0.506397008895874, + "sampling/sampling_logp_difference/max": 1.1126689910888672, + "sampling/sampling_logp_difference/mean": 0.012919275090098381, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 242.28125, + "completions/mean_terminated_length": 242.28125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.227101132273674, + "epoch": 1.6127450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03724252908873951, + "kl": 0.06919814646244049, + "learning_rate": 5.23855646112348e-07, + "loss": 0.0006, + "num_tokens": 41671667.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9311466217041016, + "sampling/importance_sampling_ratio/mean": 0.9998438358306885, + "sampling/importance_sampling_ratio/min": 0.6051800847053528, + "sampling/sampling_logp_difference/max": 0.658113956451416, + "sampling/sampling_logp_difference/mean": 0.013156576082110405, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 270.453125, + "completions/mean_terminated_length": 270.453125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.30856984853744507, + "epoch": 1.6139705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1497588769228289, + "kl": 0.09006384760141373, + "learning_rate": 5.231440541342845e-07, + "loss": -0.0173, + "num_tokens": 41704480.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5483697652816772, + "sampling/importance_sampling_ratio/mean": 1.0000381469726562, + "sampling/importance_sampling_ratio/min": 0.5144515037536621, + "sampling/sampling_logp_difference/max": 0.664654016494751, + "sampling/sampling_logp_difference/mean": 0.015054329298436642, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 201.21875, + "completions/mean_terminated_length": 201.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.18170005083084106, + "epoch": 1.6151960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05194959550465986, + "kl": 0.06692922860383987, + "learning_rate": 5.224324151752575e-07, + "loss": 0.0007, + "num_tokens": 41737742.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000636339187622, + "sampling/importance_sampling_ratio/min": 0.3987736403942108, + "sampling/sampling_logp_difference/max": 0.9193613529205322, + "sampling/sampling_logp_difference/mean": 0.011669599451124668, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 197.078125, + "completions/mean_terminated_length": 197.078125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.29809680581092834, + "epoch": 1.616421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6056980751564378, + "kl": 0.09784778952598572, + "learning_rate": 5.217207306798487e-07, + "loss": -0.0102, + "num_tokens": 41767475.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003911256790161, + "sampling/importance_sampling_ratio/min": 0.6152052879333496, + "sampling/sampling_logp_difference/max": 0.8550095558166504, + "sampling/sampling_logp_difference/mean": 0.01622714102268219, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 157.53125, + "completions/mean_terminated_length": 157.53125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.20960770547389984, + "epoch": 1.6176470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2581966492870358, + "kl": 0.07081106305122375, + "learning_rate": 5.210090020927326e-07, + "loss": -0.0059, + "num_tokens": 41795717.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000015497207642, + "sampling/importance_sampling_ratio/min": 0.5700021982192993, + "sampling/sampling_logp_difference/max": 1.0141346454620361, + "sampling/sampling_logp_difference/mean": 0.013170900754630566, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 205.828125, + "completions/mean_terminated_length": 205.828125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.24730682373046875, + "epoch": 1.6188725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052956459955202366, + "kl": 0.08272158354520798, + "learning_rate": 5.202972308586735e-07, + "loss": 0.0008, + "num_tokens": 41831658.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994138479232788, + "sampling/importance_sampling_ratio/min": 0.5493014454841614, + "sampling/sampling_logp_difference/max": 0.7052476406097412, + "sampling/sampling_logp_difference/mean": 0.015009921044111252, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 212.0625, + "completions/mean_terminated_length": 212.0625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.21910163760185242, + "epoch": 1.6200980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05277614432556432, + "kl": 0.0917169526219368, + "learning_rate": 5.195854184225213e-07, + "loss": 0.0009, + "num_tokens": 41863502.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6503266096115112, + "sampling/importance_sampling_ratio/mean": 1.0001317262649536, + "sampling/importance_sampling_ratio/min": 0.5389890670776367, + "sampling/sampling_logp_difference/max": 0.6180599927902222, + "sampling/sampling_logp_difference/mean": 0.012834159657359123, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 183.421875, + "completions/mean_terminated_length": 183.421875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.22714023292064667, + "epoch": 1.6213235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06480517359148961, + "kl": 0.08311962336301804, + "learning_rate": 5.188735662292107e-07, + "loss": 0.0008, + "num_tokens": 41891081.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6613457202911377, + "sampling/importance_sampling_ratio/mean": 1.0001468658447266, + "sampling/importance_sampling_ratio/min": 0.47708699107170105, + "sampling/sampling_logp_difference/max": 0.7400565147399902, + "sampling/sampling_logp_difference/mean": 0.01521068811416626, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 177.078125, + "completions/mean_terminated_length": 177.078125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2419014871120453, + "epoch": 1.6225490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0061410801489905, + "kl": 0.10726620256900787, + "learning_rate": 5.181616757237561e-07, + "loss": 0.034, + "num_tokens": 41917582.0, + "reward": 0.71875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.9397830963134766, + "sampling/importance_sampling_ratio/mean": 0.9992237687110901, + "sampling/importance_sampling_ratio/min": 0.5363001227378845, + "sampling/sampling_logp_difference/max": 0.6625761985778809, + "sampling/sampling_logp_difference/mean": 0.014850424602627754, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 220.28125, + "completions/mean_terminated_length": 220.28125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.2679051160812378, + "epoch": 1.6237745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1707494264096197, + "kl": 0.10701704770326614, + "learning_rate": 5.174497483512505e-07, + "loss": -0.023, + "num_tokens": 41949152.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.627264380455017, + "sampling/importance_sampling_ratio/mean": 0.9996857643127441, + "sampling/importance_sampling_ratio/min": 0.4389006197452545, + "sampling/sampling_logp_difference/max": 0.8234822750091553, + "sampling/sampling_logp_difference/mean": 0.015957824885845184, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 166.578125, + "completions/mean_terminated_length": 166.578125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.23362596333026886, + "epoch": 1.625, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4933756703267782, + "kl": 0.0941019356250763, + "learning_rate": 5.167377855568612e-07, + "loss": 0.0157, + "num_tokens": 41980773.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.673134684562683, + "sampling/importance_sampling_ratio/mean": 0.999595046043396, + "sampling/importance_sampling_ratio/min": 0.5383061766624451, + "sampling/sampling_logp_difference/max": 0.6193277835845947, + "sampling/sampling_logp_difference/mean": 0.014315936714410782, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 200.015625, + "completions/mean_terminated_length": 200.015625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.29066771268844604, + "epoch": 1.6262254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.454954066410852, + "kl": 0.10306625068187714, + "learning_rate": 5.160257887858277e-07, + "loss": -0.0341, + "num_tokens": 42019382.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001888275146484, + "sampling/importance_sampling_ratio/min": 0.2848387658596039, + "sampling/sampling_logp_difference/max": 1.2558319568634033, + "sampling/sampling_logp_difference/mean": 0.01684502884745598, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 172.640625, + "completions/mean_terminated_length": 172.640625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.21323218941688538, + "epoch": 1.6274509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5167060367394343, + "kl": 0.07055232673883438, + "learning_rate": 5.15313759483458e-07, + "loss": 0.1168, + "num_tokens": 42045663.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.808783769607544, + "sampling/importance_sampling_ratio/mean": 1.0004042387008667, + "sampling/importance_sampling_ratio/min": 0.5261567831039429, + "sampling/sampling_logp_difference/max": 0.6421560049057007, + "sampling/sampling_logp_difference/mean": 0.013901956379413605, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 171.5, + "completions/mean_terminated_length": 171.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3145783245563507, + "epoch": 1.6286764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4238544507154678, + "kl": 0.142796128988266, + "learning_rate": 5.146016990951268e-07, + "loss": -0.0195, + "num_tokens": 42075567.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992316365242004, + "sampling/importance_sampling_ratio/min": 0.24553349614143372, + "sampling/sampling_logp_difference/max": 1.4043219089508057, + "sampling/sampling_logp_difference/mean": 0.017758475616574287, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 188.28125, + "completions/mean_terminated_length": 188.28125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2112322449684143, + "epoch": 1.6299019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051044574497776975, + "kl": 0.07780525088310242, + "learning_rate": 5.138896090662714e-07, + "loss": 0.0007, + "num_tokens": 42107761.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8814224004745483, + "sampling/importance_sampling_ratio/mean": 1.0002202987670898, + "sampling/importance_sampling_ratio/min": 0.48437368869781494, + "sampling/sampling_logp_difference/max": 0.7248985767364502, + "sampling/sampling_logp_difference/mean": 0.0132973063737154, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 200.0625, + "completions/mean_terminated_length": 200.0625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2907071113586426, + "epoch": 1.6311274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.599292742810804, + "kl": 0.11069557815790176, + "learning_rate": 5.131774908423898e-07, + "loss": 0.0143, + "num_tokens": 42136021.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8153657913208008, + "sampling/importance_sampling_ratio/mean": 0.9995972514152527, + "sampling/importance_sampling_ratio/min": 0.48853954672813416, + "sampling/sampling_logp_difference/max": 0.7163348197937012, + "sampling/sampling_logp_difference/mean": 0.016580259427428246, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 195.109375, + "completions/mean_terminated_length": 195.109375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.23820869624614716, + "epoch": 1.6323529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054324159375675335, + "kl": 0.07167470455169678, + "learning_rate": 5.124653458690365e-07, + "loss": 0.0007, + "num_tokens": 42166828.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6654973030090332, + "sampling/importance_sampling_ratio/mean": 0.9998932480812073, + "sampling/importance_sampling_ratio/min": 0.5910135507583618, + "sampling/sampling_logp_difference/max": 0.525916337966919, + "sampling/sampling_logp_difference/mean": 0.014437150210142136, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 181.84375, + "completions/mean_terminated_length": 181.84375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.22880329191684723, + "epoch": 1.633578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05272930597249983, + "kl": 0.08534187078475952, + "learning_rate": 5.117531755918207e-07, + "loss": 0.0008, + "num_tokens": 42194562.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5075865983963013, + "sampling/importance_sampling_ratio/mean": 0.9995811581611633, + "sampling/importance_sampling_ratio/min": 0.5517650842666626, + "sampling/sampling_logp_difference/max": 0.5946328639984131, + "sampling/sampling_logp_difference/mean": 0.013283336535096169, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 187.6875, + "completions/mean_terminated_length": 187.6875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.22426791489124298, + "epoch": 1.6348039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0707230428521295, + "kl": 0.09419456869363785, + "learning_rate": 5.110409814564031e-07, + "loss": 0.0009, + "num_tokens": 42229310.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5982849597930908, + "sampling/importance_sampling_ratio/mean": 0.999759316444397, + "sampling/importance_sampling_ratio/min": 0.5677502155303955, + "sampling/sampling_logp_difference/max": 0.5660736560821533, + "sampling/sampling_logp_difference/mean": 0.013859902508556843, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 183.890625, + "completions/mean_terminated_length": 183.890625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3586990237236023, + "epoch": 1.6360294117647058, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3049812417751974, + "kl": 0.158889502286911, + "learning_rate": 5.103287649084926e-07, + "loss": -0.0103, + "num_tokens": 42258151.0, + "reward": 0.34375, + "reward_std": 0.676956295967102, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6215449571609497, + "sampling/importance_sampling_ratio/mean": 0.9993545413017273, + "sampling/importance_sampling_ratio/min": 0.40965768694877625, + "sampling/sampling_logp_difference/max": 0.8924334049224854, + "sampling/sampling_logp_difference/mean": 0.018728770315647125, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 165.9375, + "completions/mean_terminated_length": 165.9375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.18399828672409058, + "epoch": 1.6372549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5395292659691353, + "kl": 0.08521635830402374, + "learning_rate": 5.096165273938435e-07, + "loss": 0.0204, + "num_tokens": 42285651.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9603852033615112, + "sampling/importance_sampling_ratio/mean": 1.0003844499588013, + "sampling/importance_sampling_ratio/min": 0.6114168167114258, + "sampling/sampling_logp_difference/max": 0.6731410026550293, + "sampling/sampling_logp_difference/mean": 0.011999163776636124, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 175.109375, + "completions/mean_terminated_length": 175.109375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2937619090080261, + "epoch": 1.6384803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.034304557509628, + "kl": 0.15264631807804108, + "learning_rate": 5.089042703582533e-07, + "loss": 0.0115, + "num_tokens": 42314202.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.7399550676345825, + "sampling/importance_sampling_ratio/mean": 0.9997891187667847, + "sampling/importance_sampling_ratio/min": 0.44116657972335815, + "sampling/sampling_logp_difference/max": 0.8183327317237854, + "sampling/sampling_logp_difference/mean": 0.016184452921152115, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 212.71875, + "completions/mean_terminated_length": 212.71875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3052104115486145, + "epoch": 1.6397058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2976913823008105, + "kl": 0.11412011086940765, + "learning_rate": 5.081919952475583e-07, + "loss": 0.0008, + "num_tokens": 42351208.0, + "reward": 0.5, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5723400115966797, + "sampling/importance_sampling_ratio/mean": 0.9997768402099609, + "sampling/importance_sampling_ratio/min": 0.44444528222084045, + "sampling/sampling_logp_difference/max": 0.8109283447265625, + "sampling/sampling_logp_difference/mean": 0.017344534397125244, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 190.578125, + "completions/mean_terminated_length": 190.578125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2736814618110657, + "epoch": 1.6409313725490198, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8251000890125455, + "kl": 0.08616019040346146, + "learning_rate": 5.074797035076318e-07, + "loss": -0.0145, + "num_tokens": 42378077.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8718525171279907, + "sampling/importance_sampling_ratio/mean": 1.000880241394043, + "sampling/importance_sampling_ratio/min": 0.6115580201148987, + "sampling/sampling_logp_difference/max": 0.6269285678863525, + "sampling/sampling_logp_difference/mean": 0.014431033283472061, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 228.140625, + "completions/mean_terminated_length": 228.140625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.2970915138721466, + "epoch": 1.642156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9868492072910282, + "kl": 0.0832802876830101, + "learning_rate": 5.067673965843812e-07, + "loss": -0.011, + "num_tokens": 42409382.0, + "reward": 0.4375, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99962317943573, + "sampling/importance_sampling_ratio/min": 0.2602560520172119, + "sampling/sampling_logp_difference/max": 1.3460893630981445, + "sampling/sampling_logp_difference/mean": 0.015678174793720245, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 250.53125, + "completions/mean_terminated_length": 250.53125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.26596561074256897, + "epoch": 1.6433823529411766, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7119751113149253, + "kl": 0.07231054455041885, + "learning_rate": 5.060550759237441e-07, + "loss": -0.0751, + "num_tokens": 42441896.0, + "reward": 0.5625, + "reward_std": 0.5915650129318237, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006382465362549, + "sampling/importance_sampling_ratio/min": 0.17431412637233734, + "sampling/sampling_logp_difference/max": 1.7468962669372559, + "sampling/sampling_logp_difference/mean": 0.01413971371948719, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 190.1875, + "completions/mean_terminated_length": 190.1875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.24031300842761993, + "epoch": 1.6446078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.333026429331567, + "kl": 0.08541548997163773, + "learning_rate": 5.053427429716866e-07, + "loss": 0.0076, + "num_tokens": 42473668.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7207510471343994, + "sampling/importance_sampling_ratio/mean": 1.000162124633789, + "sampling/importance_sampling_ratio/min": 0.5210290551185608, + "sampling/sampling_logp_difference/max": 0.6519495248794556, + "sampling/sampling_logp_difference/mean": 0.014477924443781376, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 197.921875, + "completions/mean_terminated_length": 197.921875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.25727108120918274, + "epoch": 1.6458333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.016201733244008, + "kl": 0.09245032072067261, + "learning_rate": 5.046303991741993e-07, + "loss": -0.0402, + "num_tokens": 42503807.0, + "reward": 0.71875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.6272691488265991, + "sampling/importance_sampling_ratio/mean": 0.9992700815200806, + "sampling/importance_sampling_ratio/min": 0.6096329092979431, + "sampling/sampling_logp_difference/max": 0.49489831924438477, + "sampling/sampling_logp_difference/mean": 0.013924137689173222, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 207.1875, + "completions/mean_terminated_length": 207.1875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3347078263759613, + "epoch": 1.6470588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06318886619615308, + "kl": 0.11795598268508911, + "learning_rate": 5.039180459772949e-07, + "loss": 0.0012, + "num_tokens": 42534891.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001790523529053, + "sampling/importance_sampling_ratio/min": 0.4022471308708191, + "sampling/sampling_logp_difference/max": 0.9106886386871338, + "sampling/sampling_logp_difference/mean": 0.018503038212656975, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 136.9375, + "completions/mean_terminated_length": 136.9375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.21099574863910675, + "epoch": 1.6482843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06328075753077166, + "kl": 0.08794344216585159, + "learning_rate": 5.032056848270056e-07, + "loss": 0.0009, + "num_tokens": 42558279.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.561264991760254, + "sampling/importance_sampling_ratio/mean": 0.9999595880508423, + "sampling/importance_sampling_ratio/min": 0.613577663898468, + "sampling/sampling_logp_difference/max": 0.48844844102859497, + "sampling/sampling_logp_difference/mean": 0.013360563665628433, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 157.625, + "completions/mean_terminated_length": 157.625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.20779091119766235, + "epoch": 1.6495098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2777246815639125, + "kl": 0.09795111417770386, + "learning_rate": 5.02493317169379e-07, + "loss": -0.0148, + "num_tokens": 42583055.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7928379774093628, + "sampling/importance_sampling_ratio/mean": 1.0002166032791138, + "sampling/importance_sampling_ratio/min": 0.5677189826965332, + "sampling/sampling_logp_difference/max": 0.5837998390197754, + "sampling/sampling_logp_difference/mean": 0.013950050808489323, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 157.734375, + "completions/mean_terminated_length": 157.734375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.23696336150169373, + "epoch": 1.6507352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059585662552681225, + "kl": 0.10689768195152283, + "learning_rate": 5.017809444504767e-07, + "loss": 0.0011, + "num_tokens": 42612366.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6983774900436401, + "sampling/importance_sampling_ratio/mean": 1.0000231266021729, + "sampling/importance_sampling_ratio/min": 0.5634658336639404, + "sampling/sampling_logp_difference/max": 0.5736486911773682, + "sampling/sampling_logp_difference/mean": 0.015318666584789753, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 236.890625, + "completions/mean_terminated_length": 236.890625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.23486870527267456, + "epoch": 1.6519607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1154581101547827, + "kl": 0.07553314417600632, + "learning_rate": 5.010685681163698e-07, + "loss": 0.0561, + "num_tokens": 42647559.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000604391098022, + "sampling/importance_sampling_ratio/min": 0.3292165994644165, + "sampling/sampling_logp_difference/max": 1.111039400100708, + "sampling/sampling_logp_difference/mean": 0.013525919988751411, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 184.1875, + "completions/mean_terminated_length": 184.1875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3114980161190033, + "epoch": 1.653186274509804, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2259090560583856, + "kl": 0.11155524849891663, + "learning_rate": 5.003561896131374e-07, + "loss": 0.0145, + "num_tokens": 42680403.0, + "reward": 0.8125, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5709562301635742, + "sampling/importance_sampling_ratio/mean": 0.9993911385536194, + "sampling/importance_sampling_ratio/min": 0.6183300614356995, + "sampling/sampling_logp_difference/max": 0.4807329773902893, + "sampling/sampling_logp_difference/mean": 0.01637909933924675, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 203.78125, + "completions/mean_terminated_length": 203.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.23865506052970886, + "epoch": 1.6544117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04475247985952317, + "kl": 0.07838784903287888, + "learning_rate": 4.996438103868625e-07, + "loss": 0.0007, + "num_tokens": 42712293.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7526777982711792, + "sampling/importance_sampling_ratio/mean": 1.0003411769866943, + "sampling/importance_sampling_ratio/min": 0.6225943565368652, + "sampling/sampling_logp_difference/max": 0.5611448287963867, + "sampling/sampling_logp_difference/mean": 0.014617128297686577, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 203.96875, + "completions/mean_terminated_length": 203.96875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2744845449924469, + "epoch": 1.655637254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7408160022259205, + "kl": 0.11861857771873474, + "learning_rate": 4.989314318836302e-07, + "loss": -0.0126, + "num_tokens": 42741907.0, + "reward": 0.125, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.54710054397583, + "sampling/importance_sampling_ratio/mean": 0.999839186668396, + "sampling/importance_sampling_ratio/min": 0.5887571573257446, + "sampling/sampling_logp_difference/max": 0.5297415256500244, + "sampling/sampling_logp_difference/mean": 0.016750134527683258, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 150.28125, + "completions/mean_terminated_length": 150.28125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.18784059584140778, + "epoch": 1.656862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8598833573971085, + "kl": 0.13575217127799988, + "learning_rate": 4.982190555495235e-07, + "loss": -0.012, + "num_tokens": 42765061.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.7080327272415161, + "sampling/importance_sampling_ratio/mean": 0.9997662901878357, + "sampling/importance_sampling_ratio/min": 0.5367580056190491, + "sampling/sampling_logp_difference/max": 0.6222078800201416, + "sampling/sampling_logp_difference/mean": 0.013174982741475105, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 183.921875, + "completions/mean_terminated_length": 183.921875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2240990251302719, + "epoch": 1.6580882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.86401773438621, + "kl": 0.08727099746465683, + "learning_rate": 4.975066828306209e-07, + "loss": -0.0095, + "num_tokens": 42792416.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005711317062378, + "sampling/importance_sampling_ratio/min": 0.4520740509033203, + "sampling/sampling_logp_difference/max": 0.8031885623931885, + "sampling/sampling_logp_difference/mean": 0.015043207444250584, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 162.65625, + "completions/mean_terminated_length": 162.65625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2405478060245514, + "epoch": 1.659313725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.242041603566026, + "kl": 0.16278581321239471, + "learning_rate": 4.967943151729944e-07, + "loss": 0.0576, + "num_tokens": 42817242.0, + "reward": 0.59375, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6590238809585571, + "sampling/importance_sampling_ratio/mean": 0.9999129772186279, + "sampling/importance_sampling_ratio/min": 0.5087666511535645, + "sampling/sampling_logp_difference/max": 0.6757657527923584, + "sampling/sampling_logp_difference/mean": 0.013905617408454418, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 160.71875, + "completions/mean_terminated_length": 160.71875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.1898893117904663, + "epoch": 1.6605392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3933831662992229, + "kl": 0.07735336571931839, + "learning_rate": 4.96081954022705e-07, + "loss": -0.0671, + "num_tokens": 42842168.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.403991460800171, + "sampling/importance_sampling_ratio/mean": 0.9996185302734375, + "sampling/importance_sampling_ratio/min": 0.5681336522102356, + "sampling/sampling_logp_difference/max": 0.5653985738754272, + "sampling/sampling_logp_difference/mean": 0.011938979849219322, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.2664642632007599, + "epoch": 1.6617647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1981326528665563, + "kl": 0.09241677820682526, + "learning_rate": 4.953696008258008e-07, + "loss": 0.0475, + "num_tokens": 42868064.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6814212799072266, + "sampling/importance_sampling_ratio/mean": 1.0005993843078613, + "sampling/importance_sampling_ratio/min": 0.5139349699020386, + "sampling/sampling_logp_difference/max": 0.6656584739685059, + "sampling/sampling_logp_difference/mean": 0.015398381277918816, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 161.796875, + "completions/mean_terminated_length": 161.796875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2700577974319458, + "epoch": 1.6629901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.531257986742057, + "kl": 0.2919192314147949, + "learning_rate": 4.946572570283134e-07, + "loss": 0.0126, + "num_tokens": 42894803.0, + "reward": 0.59375, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6232237815856934, + "sampling/importance_sampling_ratio/mean": 0.9996989965438843, + "sampling/importance_sampling_ratio/min": 0.013000335544347763, + "sampling/sampling_logp_difference/max": 4.342780113220215, + "sampling/sampling_logp_difference/mean": 0.01691490039229393, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 165.25, + "completions/mean_terminated_length": 165.25, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.339409738779068, + "epoch": 1.6642156862745097, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8952017692855232, + "kl": 0.12340100109577179, + "learning_rate": 4.939449240762558e-07, + "loss": -0.0421, + "num_tokens": 42922131.0, + "reward": 0.71875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000413417816162, + "sampling/importance_sampling_ratio/min": 0.4595952332019806, + "sampling/sampling_logp_difference/max": 0.7774090766906738, + "sampling/sampling_logp_difference/mean": 0.01715453900396824, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 145.953125, + "completions/mean_terminated_length": 145.953125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.23316159844398499, + "epoch": 1.6654411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0963002270490013, + "kl": 0.10303225368261337, + "learning_rate": 4.932326034156189e-07, + "loss": 0.001, + "num_tokens": 42950576.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6010249853134155, + "sampling/importance_sampling_ratio/mean": 1.0004855394363403, + "sampling/importance_sampling_ratio/min": 0.5264350175857544, + "sampling/sampling_logp_difference/max": 0.641627311706543, + "sampling/sampling_logp_difference/mean": 0.01539541594684124, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 153.203125, + "completions/mean_terminated_length": 153.203125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.18769334256649017, + "epoch": 1.6666666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.613985052269561, + "kl": 0.09362373501062393, + "learning_rate": 4.925202964923683e-07, + "loss": 0.0005, + "num_tokens": 42974941.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999897480010986, + "sampling/importance_sampling_ratio/min": 0.5657834410667419, + "sampling/sampling_logp_difference/max": 0.7541909217834473, + "sampling/sampling_logp_difference/mean": 0.014007753692567348, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 219.8125, + "completions/mean_terminated_length": 219.8125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.25269484519958496, + "epoch": 1.6678921568627452, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5837754600410165, + "kl": 0.09320004284381866, + "learning_rate": 4.918080047524417e-07, + "loss": 0.0256, + "num_tokens": 43004049.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003734827041626, + "sampling/importance_sampling_ratio/min": 0.6183974146842957, + "sampling/sampling_logp_difference/max": 0.7329139709472656, + "sampling/sampling_logp_difference/mean": 0.014377745799720287, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 142.515625, + "completions/mean_terminated_length": 142.515625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2503388822078705, + "epoch": 1.6691176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19241528737319635, + "kl": 0.17062516510486603, + "learning_rate": 4.910957296417467e-07, + "loss": 0.0019, + "num_tokens": 43025442.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.693943977355957, + "sampling/importance_sampling_ratio/mean": 0.999955415725708, + "sampling/importance_sampling_ratio/min": 0.4887165129184723, + "sampling/sampling_logp_difference/max": 0.7159726619720459, + "sampling/sampling_logp_difference/mean": 0.01492222212255001, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 193.578125, + "completions/mean_terminated_length": 193.578125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.30455151200294495, + "epoch": 1.670343137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6876242542682152, + "kl": 0.16102200746536255, + "learning_rate": 4.903834726061564e-07, + "loss": -0.0021, + "num_tokens": 43059143.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998186826705933, + "sampling/importance_sampling_ratio/min": 0.5035651922225952, + "sampling/sampling_logp_difference/max": 0.7266943454742432, + "sampling/sampling_logp_difference/mean": 0.017101185396313667, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 174.03125, + "completions/mean_terminated_length": 174.03125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.24065756797790527, + "epoch": 1.6715686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1289676912772164, + "kl": 0.09058039635419846, + "learning_rate": 4.896712350915074e-07, + "loss": 0.0009, + "num_tokens": 43095993.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998894333839417, + "sampling/importance_sampling_ratio/min": 0.4640738368034363, + "sampling/sampling_logp_difference/max": 0.7677116394042969, + "sampling/sampling_logp_difference/mean": 0.01515038963407278, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 191.78125, + "completions/mean_terminated_length": 191.78125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.21339699625968933, + "epoch": 1.6727941176470589, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5234317958528387, + "kl": 0.09759987890720367, + "learning_rate": 4.889590185435969e-07, + "loss": -0.0205, + "num_tokens": 43127371.0, + "reward": 0.375, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.85578191280365, + "sampling/importance_sampling_ratio/mean": 0.9998263120651245, + "sampling/importance_sampling_ratio/min": 0.5853095054626465, + "sampling/sampling_logp_difference/max": 0.6183061599731445, + "sampling/sampling_logp_difference/mean": 0.013212048448622227, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 200.125, + "completions/mean_terminated_length": 200.125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4170733094215393, + "epoch": 1.6740196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0924402915304783, + "kl": 0.18056243658065796, + "learning_rate": 4.882468244081792e-07, + "loss": 0.0004, + "num_tokens": 43163827.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6581690311431885, + "sampling/importance_sampling_ratio/mean": 1.0002093315124512, + "sampling/importance_sampling_ratio/min": 0.5444231629371643, + "sampling/sampling_logp_difference/max": 0.6080284118652344, + "sampling/sampling_logp_difference/mean": 0.020893795415759087, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 242.46875, + "completions/mean_terminated_length": 242.46875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.17147576808929443, + "epoch": 1.6752450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0212249478750737, + "kl": 0.10019063949584961, + "learning_rate": 4.875346541309636e-07, + "loss": 0.0077, + "num_tokens": 43197793.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.646515130996704, + "sampling/importance_sampling_ratio/mean": 1.000142216682434, + "sampling/importance_sampling_ratio/min": 0.617138683795929, + "sampling/sampling_logp_difference/max": 0.4986610412597656, + "sampling/sampling_logp_difference/mean": 0.0103620495647192, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 233.5625, + "completions/mean_terminated_length": 233.5625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.2481188178062439, + "epoch": 1.6764705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3466141791726334, + "kl": 0.07373310625553131, + "learning_rate": 4.868225091576102e-07, + "loss": 0.0171, + "num_tokens": 43230949.0, + "reward": 0.3125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.4394497871398926, + "sampling/importance_sampling_ratio/mean": 0.9995990991592407, + "sampling/importance_sampling_ratio/min": 0.5695202350616455, + "sampling/sampling_logp_difference/max": 0.5629609823226929, + "sampling/sampling_logp_difference/mean": 0.013606008142232895, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 177.359375, + "completions/mean_terminated_length": 177.359375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.28148430585861206, + "epoch": 1.6776960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9725140313637766, + "kl": 0.10703254491090775, + "learning_rate": 4.861103909337285e-07, + "loss": 0.0291, + "num_tokens": 43260828.0, + "reward": 0.5, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004048347473145, + "sampling/importance_sampling_ratio/min": 0.09236104041337967, + "sampling/sampling_logp_difference/max": 2.382050037384033, + "sampling/sampling_logp_difference/mean": 0.014763194136321545, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 194.328125, + "completions/mean_terminated_length": 194.328125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.289120078086853, + "epoch": 1.678921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.09266729126875, + "kl": 0.11325451731681824, + "learning_rate": 4.853983009048732e-07, + "loss": -0.0346, + "num_tokens": 43293713.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5475056171417236, + "sampling/importance_sampling_ratio/mean": 1.0003206729888916, + "sampling/importance_sampling_ratio/min": 0.5568208694458008, + "sampling/sampling_logp_difference/max": 0.5855116844177246, + "sampling/sampling_logp_difference/mean": 0.015658417716622353, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 210.890625, + "completions/mean_terminated_length": 210.890625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3526790142059326, + "epoch": 1.6801470588235294, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0867761324539056, + "kl": 0.15972676873207092, + "learning_rate": 4.84686240516542e-07, + "loss": 0.0229, + "num_tokens": 43323930.0, + "reward": 0.3125, + "reward_std": 0.6143567562103271, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5467427968978882, + "sampling/importance_sampling_ratio/mean": 1.0000510215759277, + "sampling/importance_sampling_ratio/min": 0.47393912076950073, + "sampling/sampling_logp_difference/max": 0.7466764450073242, + "sampling/sampling_logp_difference/mean": 0.017478249967098236, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.20949649810791016, + "epoch": 1.6813725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2638909903023694, + "kl": 0.06713837385177612, + "learning_rate": 4.839742112141724e-07, + "loss": 0.0452, + "num_tokens": 43354642.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001473426818848, + "sampling/importance_sampling_ratio/min": 0.4337219297885895, + "sampling/sampling_logp_difference/max": 0.8726010322570801, + "sampling/sampling_logp_difference/mean": 0.01308098528534174, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 190.109375, + "completions/mean_terminated_length": 190.109375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.29467177391052246, + "epoch": 1.6825980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8858091141384608, + "kl": 0.11413323134183884, + "learning_rate": 4.832622144431388e-07, + "loss": 0.0203, + "num_tokens": 43385577.0, + "reward": 0.5, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5071555376052856, + "sampling/importance_sampling_ratio/mean": 0.9998645186424255, + "sampling/importance_sampling_ratio/min": 0.28268373012542725, + "sampling/sampling_logp_difference/max": 1.2634265422821045, + "sampling/sampling_logp_difference/mean": 0.01592201367020607, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 206.546875, + "completions/mean_terminated_length": 206.546875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3291833996772766, + "epoch": 1.6838235294117647, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9178705450875981, + "kl": 0.12914146482944489, + "learning_rate": 4.825502516487496e-07, + "loss": 0.0166, + "num_tokens": 43418764.0, + "reward": -0.125, + "reward_std": 0.6663130521774292, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6523675918579102, + "sampling/importance_sampling_ratio/mean": 1.0004007816314697, + "sampling/importance_sampling_ratio/min": 0.6056222319602966, + "sampling/sampling_logp_difference/max": 0.5022091865539551, + "sampling/sampling_logp_difference/mean": 0.01776885613799095, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 203.453125, + "completions/mean_terminated_length": 203.453125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.3313523530960083, + "epoch": 1.6850490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8827964833937998, + "kl": 0.14949117600917816, + "learning_rate": 4.818383242762439e-07, + "loss": 0.0148, + "num_tokens": 43456601.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8197872638702393, + "sampling/importance_sampling_ratio/mean": 0.9996486902236938, + "sampling/importance_sampling_ratio/min": 0.5531119704246521, + "sampling/sampling_logp_difference/max": 0.598719596862793, + "sampling/sampling_logp_difference/mean": 0.017888199537992477, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 195.15625, + "completions/mean_terminated_length": 195.15625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2908930778503418, + "epoch": 1.6862745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.004422595938958, + "kl": 0.16500240564346313, + "learning_rate": 4.811264337707894e-07, + "loss": -0.0089, + "num_tokens": 43484275.0, + "reward": 0.15625, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.603110432624817, + "sampling/importance_sampling_ratio/mean": 1.0009064674377441, + "sampling/importance_sampling_ratio/min": 0.6171850562095642, + "sampling/sampling_logp_difference/max": 0.4825863838195801, + "sampling/sampling_logp_difference/mean": 0.014188327826559544, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 178.0, + "completions/mean_terminated_length": 178.0, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2866644859313965, + "epoch": 1.6875, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.153546925697768, + "kl": 0.17025037109851837, + "learning_rate": 4.804145815774786e-07, + "loss": 0.0083, + "num_tokens": 43515667.0, + "reward": 0.65625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002154111862183, + "sampling/importance_sampling_ratio/min": 0.342887818813324, + "sampling/sampling_logp_difference/max": 1.0703519582748413, + "sampling/sampling_logp_difference/mean": 0.01696593686938286, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 181.03125, + "completions/mean_terminated_length": 181.03125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.29743701219558716, + "epoch": 1.6887254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0662312476330906, + "kl": 0.1554834097623825, + "learning_rate": 4.797027691413267e-07, + "loss": 0.0335, + "num_tokens": 43541717.0, + "reward": -0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.580533504486084, + "sampling/importance_sampling_ratio/mean": 1.000537395477295, + "sampling/importance_sampling_ratio/min": 0.5968798398971558, + "sampling/sampling_logp_difference/max": 0.5160394906997681, + "sampling/sampling_logp_difference/mean": 0.016106370836496353, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 255.859375, + "completions/mean_terminated_length": 255.859375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.33325445652008057, + "epoch": 1.6899509803921569, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.158591515821032, + "kl": 0.11241887509822845, + "learning_rate": 4.789909979072673e-07, + "loss": -0.04, + "num_tokens": 43579564.0, + "reward": 0.25, + "reward_std": 0.9245119094848633, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.4743503332138062, + "sampling/importance_sampling_ratio/mean": 0.9997340440750122, + "sampling/importance_sampling_ratio/min": 0.5142372846603394, + "sampling/sampling_logp_difference/max": 0.6650705337524414, + "sampling/sampling_logp_difference/mean": 0.017467955127358437, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 204.3125, + "completions/mean_terminated_length": 204.3125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.35654231905937195, + "epoch": 1.6911764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9717870161455187, + "kl": 0.13017581403255463, + "learning_rate": 4.782792693201513e-07, + "loss": -0.0003, + "num_tokens": 43610064.0, + "reward": -0.53125, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": -0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.9441932439804077, + "sampling/importance_sampling_ratio/mean": 0.9997327327728271, + "sampling/importance_sampling_ratio/min": 0.42920300364494324, + "sampling/sampling_logp_difference/max": 0.8458253145217896, + "sampling/sampling_logp_difference/mean": 0.018064867705106735, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 201.0, + "completions/mean_terminated_length": 201.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.33480149507522583, + "epoch": 1.6924019607843137, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.50333582908066, + "kl": 0.1297426074743271, + "learning_rate": 4.775675848247427e-07, + "loss": 0.0687, + "num_tokens": 43641392.0, + "reward": 0.53125, + "reward_std": 0.7129635810852051, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6577258110046387, + "sampling/importance_sampling_ratio/mean": 0.999484658241272, + "sampling/importance_sampling_ratio/min": 0.48239606618881226, + "sampling/sampling_logp_difference/max": 0.7289897799491882, + "sampling/sampling_logp_difference/mean": 0.01659483090043068, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 166.203125, + "completions/mean_terminated_length": 166.203125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.24771378934383392, + "epoch": 1.6936274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3521541468766267, + "kl": 0.1873389482498169, + "learning_rate": 4.768559458657155e-07, + "loss": -0.0039, + "num_tokens": 43667037.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994951486587524, + "sampling/importance_sampling_ratio/min": 0.48710572719573975, + "sampling/sampling_logp_difference/max": 0.7192740440368652, + "sampling/sampling_logp_difference/mean": 0.015180293470621109, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 176.625, + "completions/mean_terminated_length": 176.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.28814393281936646, + "epoch": 1.6948529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2128747355978162, + "kl": 0.09616734087467194, + "learning_rate": 4.7614435388765203e-07, + "loss": 0.0534, + "num_tokens": 43703829.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.9280849695205688, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.22572924196720123, + "sampling/sampling_logp_difference/max": 1.4884190559387207, + "sampling/sampling_logp_difference/mean": 0.01692255213856697, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 144.046875, + "completions/mean_terminated_length": 144.046875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2953818440437317, + "epoch": 1.696078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.204259061567527, + "kl": 0.11235234141349792, + "learning_rate": 4.7543281033503885e-07, + "loss": 0.0196, + "num_tokens": 43732008.0, + "reward": 0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.7009391784667969, + "sampling/importance_sampling_ratio/mean": 0.9996429085731506, + "sampling/importance_sampling_ratio/min": 0.6176415085792542, + "sampling/sampling_logp_difference/max": 0.5311806201934814, + "sampling/sampling_logp_difference/mean": 0.017398793250322342, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 291.375, + "completions/mean_terminated_length": 291.375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.25864797830581665, + "epoch": 1.6973039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2438733835222997, + "kl": 0.06752866506576538, + "learning_rate": 4.747213166522644e-07, + "loss": -0.005, + "num_tokens": 43768720.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.682944893836975, + "sampling/importance_sampling_ratio/mean": 0.9999276399612427, + "sampling/importance_sampling_ratio/min": 0.5483670830726624, + "sampling/sampling_logp_difference/max": 0.6008102893829346, + "sampling/sampling_logp_difference/mean": 0.012968155555427074, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 180.40625, + "completions/mean_terminated_length": 180.40625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.270290732383728, + "epoch": 1.6985294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04900553241285253, + "kl": 0.1089608296751976, + "learning_rate": 4.740098742836156e-07, + "loss": 0.001, + "num_tokens": 43794346.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5467870235443115, + "sampling/importance_sampling_ratio/mean": 0.9996156692504883, + "sampling/importance_sampling_ratio/min": 0.44426000118255615, + "sampling/sampling_logp_difference/max": 0.8113453388214111, + "sampling/sampling_logp_difference/mean": 0.013380180113017559, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 213.390625, + "completions/mean_terminated_length": 213.390625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.31980079412460327, + "epoch": 1.6997549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8001131038347111, + "kl": 0.10845265537500381, + "learning_rate": 4.732984846732755e-07, + "loss": 0.033, + "num_tokens": 43826227.0, + "reward": 0.3125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6718604564666748, + "sampling/importance_sampling_ratio/mean": 0.9998260736465454, + "sampling/importance_sampling_ratio/min": 0.4855412542819977, + "sampling/sampling_logp_difference/max": 0.7224910259246826, + "sampling/sampling_logp_difference/mean": 0.015616693533957005, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 202.984375, + "completions/mean_terminated_length": 202.984375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.1872464269399643, + "epoch": 1.7009803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04457015442774327, + "kl": 0.08273518085479736, + "learning_rate": 4.725871492653199e-07, + "loss": 0.0008, + "num_tokens": 43856914.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4354684352874756, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + "sampling/importance_sampling_ratio/min": 0.6215065121650696, + "sampling/sampling_logp_difference/max": 0.47560882568359375, + "sampling/sampling_logp_difference/mean": 0.011172572150826454, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 189.546875, + "completions/mean_terminated_length": 189.546875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.27226871252059937, + "epoch": 1.7022058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2768027253593581, + "kl": 0.1160283237695694, + "learning_rate": 4.718758695037149e-07, + "loss": -0.0119, + "num_tokens": 43884901.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.795386552810669, + "sampling/importance_sampling_ratio/mean": 0.9995177984237671, + "sampling/importance_sampling_ratio/min": 0.617138147354126, + "sampling/sampling_logp_difference/max": 0.5852203369140625, + "sampling/sampling_logp_difference/mean": 0.015012630261480808, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 204.9375, + "completions/mean_terminated_length": 204.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3289707899093628, + "epoch": 1.7034313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0254271834874007, + "kl": 0.14430175721645355, + "learning_rate": 4.7116464683231285e-07, + "loss": -0.004, + "num_tokens": 43920657.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8759516477584839, + "sampling/importance_sampling_ratio/mean": 1.0001757144927979, + "sampling/importance_sampling_ratio/min": 0.4629943072795868, + "sampling/sampling_logp_difference/max": 0.7700405120849609, + "sampling/sampling_logp_difference/mean": 0.01841399446129799, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 213.5, + "completions/mean_terminated_length": 213.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.33329734206199646, + "epoch": 1.704656862745098, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.951807007220008, + "kl": 0.13255050778388977, + "learning_rate": 4.704534826948509e-07, + "loss": 0.0051, + "num_tokens": 43955569.0, + "reward": 0.5625, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001225471496582, + "sampling/importance_sampling_ratio/min": 0.41612565517425537, + "sampling/sampling_logp_difference/max": 0.8767680525779724, + "sampling/sampling_logp_difference/mean": 0.016625236719846725, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 157.890625, + "completions/mean_terminated_length": 157.890625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.22026416659355164, + "epoch": 1.7058823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07321305838820764, + "kl": 0.10328660905361176, + "learning_rate": 4.6974237853494744e-07, + "loss": 0.0009, + "num_tokens": 43984746.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000003695487976, + "sampling/importance_sampling_ratio/min": 0.33506107330322266, + "sampling/sampling_logp_difference/max": 1.093442440032959, + "sampling/sampling_logp_difference/mean": 0.01616033911705017, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 188.875, + "completions/mean_terminated_length": 188.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.32607588171958923, + "epoch": 1.7071078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.169039318542256, + "kl": 0.16327345371246338, + "learning_rate": 4.690313357960985e-07, + "loss": -0.0089, + "num_tokens": 44019010.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.8102972507476807, + "sampling/importance_sampling_ratio/mean": 1.000331163406372, + "sampling/importance_sampling_ratio/min": 0.21244746446609497, + "sampling/sampling_logp_difference/max": 1.549060583114624, + "sampling/sampling_logp_difference/mean": 0.015983667224645615, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 170.59375, + "completions/mean_terminated_length": 170.59375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.30761396884918213, + "epoch": 1.7083333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6522475555481437, + "kl": 0.14691162109375, + "learning_rate": 4.68320355921676e-07, + "loss": 0.0122, + "num_tokens": 44047800.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8312194347381592, + "sampling/importance_sampling_ratio/mean": 1.0005525350570679, + "sampling/importance_sampling_ratio/min": 0.5328331589698792, + "sampling/sampling_logp_difference/max": 0.6295468807220459, + "sampling/sampling_logp_difference/mean": 0.015194841660559177, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 221.546875, + "completions/mean_terminated_length": 221.546875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.2581326961517334, + "epoch": 1.7095588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04480658682677194, + "kl": 0.0881354808807373, + "learning_rate": 4.67609440354924e-07, + "loss": 0.0008, + "num_tokens": 44082923.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007344722747803, + "sampling/importance_sampling_ratio/mean": 1.0008583068847656, + "sampling/importance_sampling_ratio/min": 0.5205962657928467, + "sampling/sampling_logp_difference/max": 0.6527805328369141, + "sampling/sampling_logp_difference/mean": 0.014604821801185608, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 261.296875, + "completions/mean_terminated_length": 261.296875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4104100465774536, + "epoch": 1.7107843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2406398167479664, + "kl": 0.10742566734552383, + "learning_rate": 4.668985905389563e-07, + "loss": 0.0187, + "num_tokens": 44122366.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5020952224731445, + "sampling/importance_sampling_ratio/mean": 1.0001171827316284, + "sampling/importance_sampling_ratio/min": 0.6024896502494812, + "sampling/sampling_logp_difference/max": 0.5066847801208496, + "sampling/sampling_logp_difference/mean": 0.01636691950261593, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 197.125, + "completions/mean_terminated_length": 197.125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.27860528230667114, + "epoch": 1.7120098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1965616760044813, + "kl": 0.0851714164018631, + "learning_rate": 4.661878079167526e-07, + "loss": 0.0204, + "num_tokens": 44157814.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8656429052352905, + "sampling/importance_sampling_ratio/mean": 0.9995005130767822, + "sampling/importance_sampling_ratio/min": 0.48173046112060547, + "sampling/sampling_logp_difference/max": 0.7303705215454102, + "sampling/sampling_logp_difference/mean": 0.015480025671422482, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 204.171875, + "completions/mean_terminated_length": 204.171875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.24089059233665466, + "epoch": 1.7132352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3616082743869733, + "kl": 0.10055754333734512, + "learning_rate": 4.6547709393115677e-07, + "loss": 0.0101, + "num_tokens": 44186033.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.601683497428894, + "sampling/importance_sampling_ratio/mean": 0.9999518990516663, + "sampling/importance_sampling_ratio/min": 0.4732785224914551, + "sampling/sampling_logp_difference/max": 0.7480711936950684, + "sampling/sampling_logp_difference/mean": 0.014110399410128593, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 168.640625, + "completions/mean_terminated_length": 168.640625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3112737536430359, + "epoch": 1.7144607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5091896296688616, + "kl": 0.11020766198635101, + "learning_rate": 4.6476645002487295e-07, + "loss": -0.0083, + "num_tokens": 44219178.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5657960176467896, + "sampling/importance_sampling_ratio/mean": 0.9997831583023071, + "sampling/importance_sampling_ratio/min": 0.5919474959373474, + "sampling/sampling_logp_difference/max": 0.5243372917175293, + "sampling/sampling_logp_difference/mean": 0.017252368852496147, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 167.71875, + "completions/mean_terminated_length": 167.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2800918221473694, + "epoch": 1.715686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3703025697659408, + "kl": 0.1219969168305397, + "learning_rate": 4.640558776404639e-07, + "loss": 0.0302, + "num_tokens": 44252744.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5375255346298218, + "sampling/importance_sampling_ratio/mean": 0.9999685287475586, + "sampling/importance_sampling_ratio/min": 0.24667833745479584, + "sampling/sampling_logp_difference/max": 1.399670124053955, + "sampling/sampling_logp_difference/mean": 0.015770550817251205, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 175.390625, + "completions/mean_terminated_length": 175.390625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.27610814571380615, + "epoch": 1.7169117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7524755269519727, + "kl": 0.12056620419025421, + "learning_rate": 4.633453782203458e-07, + "loss": -0.0351, + "num_tokens": 44278529.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002723932266235, + "sampling/importance_sampling_ratio/min": 0.3573823869228363, + "sampling/sampling_logp_difference/max": 1.0289490222930908, + "sampling/sampling_logp_difference/mean": 0.016771188005805016, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 175.140625, + "completions/mean_terminated_length": 175.140625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3369879126548767, + "epoch": 1.718137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830706028504374, + "kl": 0.2310381531715393, + "learning_rate": 4.626349532067879e-07, + "loss": 0.0021, + "num_tokens": 44308106.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.499764084815979, + "sampling/importance_sampling_ratio/mean": 0.99969482421875, + "sampling/importance_sampling_ratio/min": 0.6117817163467407, + "sampling/sampling_logp_difference/max": 0.4913797378540039, + "sampling/sampling_logp_difference/mean": 0.016985496506094933, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 141.84375, + "completions/mean_terminated_length": 141.84375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.14914551377296448, + "epoch": 1.719362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11856309333521355, + "kl": 0.1281770020723343, + "learning_rate": 4.6192460404190793e-07, + "loss": 0.0012, + "num_tokens": 44334448.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7212589979171753, + "sampling/importance_sampling_ratio/mean": 0.9995043277740479, + "sampling/importance_sampling_ratio/min": 0.14807994663715363, + "sampling/sampling_logp_difference/max": 1.9100029468536377, + "sampling/sampling_logp_difference/mean": 0.010671250522136688, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 177.859375, + "completions/mean_terminated_length": 177.859375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.35706669092178345, + "epoch": 1.7205882352941178, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.161484119754085, + "kl": 0.1885259598493576, + "learning_rate": 4.6121433216766935e-07, + "loss": -0.0016, + "num_tokens": 44365031.0, + "reward": 0.71875, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7002179622650146, + "sampling/importance_sampling_ratio/mean": 0.9996144771575928, + "sampling/importance_sampling_ratio/min": 0.46483224630355835, + "sampling/sampling_logp_difference/max": 0.7660787105560303, + "sampling/sampling_logp_difference/mean": 0.016989562660455704, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 136.234375, + "completions/mean_terminated_length": 136.234375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.2509397268295288, + "epoch": 1.721813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.279440526685593, + "kl": 0.11184224486351013, + "learning_rate": 4.605041390258794e-07, + "loss": 0.0056, + "num_tokens": 44391654.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4854940176010132, + "sampling/importance_sampling_ratio/mean": 1.0001810789108276, + "sampling/importance_sampling_ratio/min": 0.6172491908073425, + "sampling/sampling_logp_difference/max": 0.4824824333190918, + "sampling/sampling_logp_difference/mean": 0.013479331508278847, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 174.46875, + "completions/mean_terminated_length": 174.46875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3067903518676758, + "epoch": 1.7230392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9470053509376697, + "kl": 0.1597614288330078, + "learning_rate": 4.5979402605818514e-07, + "loss": -0.001, + "num_tokens": 44421972.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7133095264434814, + "sampling/importance_sampling_ratio/mean": 1.000095009803772, + "sampling/importance_sampling_ratio/min": 0.4271281063556671, + "sampling/sampling_logp_difference/max": 0.8506712913513184, + "sampling/sampling_logp_difference/mean": 0.016327355057001114, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 149.140625, + "completions/mean_terminated_length": 149.140625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.32089683413505554, + "epoch": 1.7242647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7019283557406544, + "kl": 0.13303419947624207, + "learning_rate": 4.5908399470607104e-07, + "loss": -0.0008, + "num_tokens": 44447981.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5467069149017334, + "sampling/importance_sampling_ratio/mean": 1.0000979900360107, + "sampling/importance_sampling_ratio/min": 0.4128660261631012, + "sampling/sampling_logp_difference/max": 0.8846321105957031, + "sampling/sampling_logp_difference/mean": 0.01682734489440918, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 198.296875, + "completions/mean_terminated_length": 198.296875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.4076573848724365, + "epoch": 1.7254901960784315, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.77545471302357, + "kl": 0.1400512307882309, + "learning_rate": 4.5837404641085535e-07, + "loss": -0.0141, + "num_tokens": 44487152.0, + "reward": 0.3125, + "reward_std": 0.6707825064659119, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9102158546447754, + "sampling/importance_sampling_ratio/mean": 1.0003080368041992, + "sampling/importance_sampling_ratio/min": 0.36251986026763916, + "sampling/sampling_logp_difference/max": 1.0146760940551758, + "sampling/sampling_logp_difference/mean": 0.018194351345300674, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 210.78125, + "completions/mean_terminated_length": 210.78125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2296508550643921, + "epoch": 1.7267156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.108559590314233, + "kl": 0.08380186557769775, + "learning_rate": 4.576641826136884e-07, + "loss": 0.0064, + "num_tokens": 44519154.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00016188621521, + "sampling/importance_sampling_ratio/min": 0.43257567286491394, + "sampling/sampling_logp_difference/max": 0.8849180936813354, + "sampling/sampling_logp_difference/mean": 0.01322566345334053, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 156.765625, + "completions/mean_terminated_length": 156.765625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2926194369792938, + "epoch": 1.7279411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.6724582500652336, + "kl": 0.15845778584480286, + "learning_rate": 4.5695440475554864e-07, + "loss": -0.0517, + "num_tokens": 44546243.0, + "reward": 0.5, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7805469036102295, + "sampling/importance_sampling_ratio/mean": 1.0001507997512817, + "sampling/importance_sampling_ratio/min": 0.6211308836936951, + "sampling/sampling_logp_difference/max": 0.5769205093383789, + "sampling/sampling_logp_difference/mean": 0.014656851068139076, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 158.6875, + "completions/mean_terminated_length": 158.6875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3535138964653015, + "epoch": 1.7291666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13681855629173315, + "kl": 0.1603984832763672, + "learning_rate": 4.5624471427724036e-07, + "loss": 0.0015, + "num_tokens": 44570383.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5744293928146362, + "sampling/importance_sampling_ratio/mean": 0.9997444152832031, + "sampling/importance_sampling_ratio/min": 0.6240845322608948, + "sampling/sampling_logp_difference/max": 0.4714694023132324, + "sampling/sampling_logp_difference/mean": 0.017064634710550308, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 157.359375, + "completions/mean_terminated_length": 157.359375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.28453484177589417, + "epoch": 1.7303921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1375969874177505, + "kl": 0.14199012517929077, + "learning_rate": 4.5553511261939e-07, + "loss": 0.0074, + "num_tokens": 44599782.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.9123090505599976, + "sampling/importance_sampling_ratio/mean": 1.000168800354004, + "sampling/importance_sampling_ratio/min": 0.48103198409080505, + "sampling/sampling_logp_difference/max": 0.7318215370178223, + "sampling/sampling_logp_difference/mean": 0.015347521752119064, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.27037662267684937, + "epoch": 1.7316176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7043442480997766, + "kl": 0.17010244727134705, + "learning_rate": 4.5482560122244407e-07, + "loss": -0.0074, + "num_tokens": 44623096.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6923563480377197, + "sampling/importance_sampling_ratio/mean": 1.0002752542495728, + "sampling/importance_sampling_ratio/min": 0.6042792797088623, + "sampling/sampling_logp_difference/max": 0.5261218547821045, + "sampling/sampling_logp_difference/mean": 0.015295255929231644, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 193.203125, + "completions/mean_terminated_length": 193.203125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.271964967250824, + "epoch": 1.732843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3621892146942411, + "kl": 0.10405793786048889, + "learning_rate": 4.541161815266658e-07, + "loss": 0.0561, + "num_tokens": 44653477.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7014896869659424, + "sampling/importance_sampling_ratio/mean": 1.000382900238037, + "sampling/importance_sampling_ratio/min": 0.6019006371498108, + "sampling/sampling_logp_difference/max": 0.5315041542053223, + "sampling/sampling_logp_difference/mean": 0.014222925528883934, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 162.515625, + "completions/mean_terminated_length": 162.515625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2344822883605957, + "epoch": 1.7340686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14854398978953112, + "kl": 0.1217186450958252, + "learning_rate": 4.534068549721324e-07, + "loss": 0.0012, + "num_tokens": 44678726.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5074125528335571, + "sampling/importance_sampling_ratio/mean": 1.0000494718551636, + "sampling/importance_sampling_ratio/min": 0.5466105937957764, + "sampling/sampling_logp_difference/max": 0.6040186882019043, + "sampling/sampling_logp_difference/mean": 0.014358972199261189, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 174.375, + "completions/mean_terminated_length": 174.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2636425495147705, + "epoch": 1.7352941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4177361675699305, + "kl": 0.11610788106918335, + "learning_rate": 4.5269762299873144e-07, + "loss": 0.0153, + "num_tokens": 44711294.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.597176194190979, + "sampling/importance_sampling_ratio/mean": 0.9999326467514038, + "sampling/importance_sampling_ratio/min": 0.6056553721427917, + "sampling/sampling_logp_difference/max": 0.5014441013336182, + "sampling/sampling_logp_difference/mean": 0.014468722976744175, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 202.109375, + "completions/mean_terminated_length": 202.109375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.32975006103515625, + "epoch": 1.7365196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4751448800671598, + "kl": 0.21373483538627625, + "learning_rate": 4.519884870461591e-07, + "loss": -0.0795, + "num_tokens": 44743829.0, + "reward": 0.53125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5812915563583374, + "sampling/importance_sampling_ratio/mean": 0.9993947148323059, + "sampling/importance_sampling_ratio/min": 0.48101913928985596, + "sampling/sampling_logp_difference/max": 0.7318482398986816, + "sampling/sampling_logp_difference/mean": 0.015227858908474445, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 134.84375, + "completions/mean_terminated_length": 134.84375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2488342523574829, + "epoch": 1.7377450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2362308456338915, + "kl": 0.15685197710990906, + "learning_rate": 4.512794485539165e-07, + "loss": -0.0226, + "num_tokens": 44766139.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7938346862792969, + "sampling/importance_sampling_ratio/mean": 0.9995754957199097, + "sampling/importance_sampling_ratio/min": 0.6154866218566895, + "sampling/sampling_logp_difference/max": 0.5843555927276611, + "sampling/sampling_logp_difference/mean": 0.012182684615254402, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 157.0, + "completions/mean_terminated_length": 157.0, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3114086389541626, + "epoch": 1.7389705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9221166858957488, + "kl": 0.13919496536254883, + "learning_rate": 4.505705089613068e-07, + "loss": -0.0021, + "num_tokens": 44792923.0, + "reward": 0.71875, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337196350098, + "sampling/importance_sampling_ratio/min": 0.6188860535621643, + "sampling/sampling_logp_difference/max": 0.7540798187255859, + "sampling/sampling_logp_difference/mean": 0.01507820375263691, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 140.96875, + "completions/mean_terminated_length": 140.96875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2667887210845947, + "epoch": 1.7401960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06858234105509331, + "kl": 0.14108605682849884, + "learning_rate": 4.4986166970743233e-07, + "loss": 0.0015, + "num_tokens": 44816153.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3518649339675903, + "sampling/importance_sampling_ratio/mean": 1.0000407695770264, + "sampling/importance_sampling_ratio/min": 0.49824461340904236, + "sampling/sampling_logp_difference/max": 0.6966640949249268, + "sampling/sampling_logp_difference/mean": 0.01382505428045988, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 168.203125, + "completions/mean_terminated_length": 168.203125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3003990650177002, + "epoch": 1.741421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.128264659039382, + "kl": 0.15019284188747406, + "learning_rate": 4.4915293223119205e-07, + "loss": -0.0544, + "num_tokens": 44842950.0, + "reward": 0.125, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5724730491638184, + "sampling/importance_sampling_ratio/mean": 1.0003362894058228, + "sampling/importance_sampling_ratio/min": 0.5630784630775452, + "sampling/sampling_logp_difference/max": 0.5743362903594971, + "sampling/sampling_logp_difference/mean": 0.015519457869231701, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 172.5, + "completions/mean_terminated_length": 172.5, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.40841609239578247, + "epoch": 1.7426470588235294, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3376997319413864, + "kl": 0.17027829587459564, + "learning_rate": 4.484442979712783e-07, + "loss": 0.0297, + "num_tokens": 44876934.0, + "reward": 0.6875, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.7583552598953247, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 0.4725813865661621, + "sampling/sampling_logp_difference/max": 0.7495453357696533, + "sampling/sampling_logp_difference/mean": 0.019672289490699768, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 129.578125, + "completions/mean_terminated_length": 129.578125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.24084897339344025, + "epoch": 1.7438725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.3901074466322845, + "kl": 0.17182883620262146, + "learning_rate": 4.477357683661733e-07, + "loss": 0.0176, + "num_tokens": 44899499.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.546775221824646, + "sampling/importance_sampling_ratio/mean": 0.9997415542602539, + "sampling/importance_sampling_ratio/min": 0.6130619645118713, + "sampling/sampling_logp_difference/max": 0.4892892837524414, + "sampling/sampling_logp_difference/mean": 0.012741761282086372, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 173.28125, + "completions/mean_terminated_length": 173.28125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.28197795152664185, + "epoch": 1.7450980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05537464355682766, + "kl": 0.16134506464004517, + "learning_rate": 4.470273448541475e-07, + "loss": 0.0015, + "num_tokens": 44924973.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6356489658355713, + "sampling/importance_sampling_ratio/mean": 0.9994578957557678, + "sampling/importance_sampling_ratio/min": 0.4031654894351959, + "sampling/sampling_logp_difference/max": 0.9084081649780273, + "sampling/sampling_logp_difference/mean": 0.015645436942577362, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 203.125, + "completions/mean_terminated_length": 203.125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.30695268511772156, + "epoch": 1.7463235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4887156549415028, + "kl": 0.12615860998630524, + "learning_rate": 4.4631902887325567e-07, + "loss": -0.016, + "num_tokens": 44960245.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7520533800125122, + "sampling/importance_sampling_ratio/mean": 1.000196099281311, + "sampling/importance_sampling_ratio/min": 0.5263950228691101, + "sampling/sampling_logp_difference/max": 0.6417033672332764, + "sampling/sampling_logp_difference/mean": 0.017607446759939194, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 192.515625, + "completions/mean_terminated_length": 192.515625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2785497307777405, + "epoch": 1.7475490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3593862911315862, + "kl": 0.116864413022995, + "learning_rate": 4.4561082186133456e-07, + "loss": 0.0255, + "num_tokens": 44985142.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5012266635894775, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, + "sampling/importance_sampling_ratio/min": 0.6378768682479858, + "sampling/sampling_logp_difference/max": 0.44960999488830566, + "sampling/sampling_logp_difference/mean": 0.014556299895048141, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 202.15625, + "completions/mean_terminated_length": 202.15625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.2945750951766968, + "epoch": 1.7487745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8847566894250378, + "kl": 0.10302512347698212, + "learning_rate": 4.4490272525599936e-07, + "loss": -0.0613, + "num_tokens": 45018288.0, + "reward": 0.28125, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6132599115371704, + "sampling/importance_sampling_ratio/mean": 1.0001158714294434, + "sampling/importance_sampling_ratio/min": 0.5182479619979858, + "sampling/sampling_logp_difference/max": 0.6573014259338379, + "sampling/sampling_logp_difference/mean": 0.014942665584385395, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 171.671875, + "completions/mean_terminated_length": 171.671875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2976667284965515, + "epoch": 1.75, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.687040315389125, + "kl": 0.15053917467594147, + "learning_rate": 4.4419474049464135e-07, + "loss": -0.0284, + "num_tokens": 45044187.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5078672170639038, + "sampling/importance_sampling_ratio/mean": 1.0005179643630981, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.014646529220044613, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 186.984375, + "completions/mean_terminated_length": 186.984375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.24708235263824463, + "epoch": 1.7512254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061962294927390936, + "kl": 0.13475078344345093, + "learning_rate": 4.43486869014425e-07, + "loss": 0.0013, + "num_tokens": 45077210.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.609244704246521, + "sampling/importance_sampling_ratio/mean": 0.9998207688331604, + "sampling/importance_sampling_ratio/min": 0.47399142384529114, + "sampling/sampling_logp_difference/max": 0.7465660572052002, + "sampling/sampling_logp_difference/mean": 0.013482634909451008, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 202.765625, + "completions/mean_terminated_length": 202.765625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.22805166244506836, + "epoch": 1.7524509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3470476320260578, + "kl": 0.06614042818546295, + "learning_rate": 4.427791122522841e-07, + "loss": 0.0056, + "num_tokens": 45116507.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8078150749206543, + "sampling/importance_sampling_ratio/mean": 0.9995851516723633, + "sampling/importance_sampling_ratio/min": 0.36503711342811584, + "sampling/sampling_logp_difference/max": 1.007756233215332, + "sampling/sampling_logp_difference/mean": 0.014406598173081875, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 208.5, + "completions/mean_terminated_length": 208.5, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.28755977749824524, + "epoch": 1.7536764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0000316963571345, + "kl": 0.11514320969581604, + "learning_rate": 4.420714716449203e-07, + "loss": -0.0192, + "num_tokens": 45146779.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998273253440857, + "sampling/importance_sampling_ratio/min": 0.4000128507614136, + "sampling/sampling_logp_difference/max": 0.9162585735321045, + "sampling/sampling_logp_difference/mean": 0.016624003648757935, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 205.0625, + "completions/mean_terminated_length": 205.0625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.29105067253112793, + "epoch": 1.7549019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7683078310477278, + "kl": 0.11758530139923096, + "learning_rate": 4.413639486287991e-07, + "loss": 0.0201, + "num_tokens": 45178447.0, + "reward": 0.3125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9833096265792847, + "sampling/importance_sampling_ratio/mean": 0.9991083145141602, + "sampling/importance_sampling_ratio/min": 0.6146351099014282, + "sampling/sampling_logp_difference/max": 0.6847670078277588, + "sampling/sampling_logp_difference/mean": 0.014459026977419853, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 199.328125, + "completions/mean_terminated_length": 199.328125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.21505844593048096, + "epoch": 1.7561274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041452161555423174, + "kl": 0.07256513088941574, + "learning_rate": 4.406565446401476e-07, + "loss": 0.0007, + "num_tokens": 45207076.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6253396272659302, + "sampling/importance_sampling_ratio/mean": 0.9993472099304199, + "sampling/importance_sampling_ratio/min": 0.5857216119766235, + "sampling/sampling_logp_difference/max": 0.5349106788635254, + "sampling/sampling_logp_difference/mean": 0.012444807216525078, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 205.359375, + "completions/mean_terminated_length": 205.359375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2595762312412262, + "epoch": 1.7573529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052811280893990606, + "kl": 0.08328672498464584, + "learning_rate": 4.399492611149509e-07, + "loss": 0.0008, + "num_tokens": 45237019.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6590856313705444, + "sampling/importance_sampling_ratio/mean": 0.9994620680809021, + "sampling/importance_sampling_ratio/min": 0.5040537714958191, + "sampling/sampling_logp_difference/max": 0.6850724220275879, + "sampling/sampling_logp_difference/mean": 0.014014007523655891, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 219.6875, + "completions/mean_terminated_length": 219.6875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.36765241622924805, + "epoch": 1.758578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5545189590979627, + "kl": 0.10093101859092712, + "learning_rate": 4.392420994889498e-07, + "loss": 0.0313, + "num_tokens": 45267895.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.667101502418518, + "sampling/importance_sampling_ratio/mean": 1.000354290008545, + "sampling/importance_sampling_ratio/min": 0.6023568511009216, + "sampling/sampling_logp_difference/max": 0.5110864639282227, + "sampling/sampling_logp_difference/mean": 0.01743512973189354, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 164.53125, + "completions/mean_terminated_length": 164.53125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.18837663531303406, + "epoch": 1.7598039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05479383899943296, + "kl": 0.0949767529964447, + "learning_rate": 4.385350611976376e-07, + "loss": 0.0009, + "num_tokens": 45293993.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6428558826446533, + "sampling/importance_sampling_ratio/mean": 0.999722421169281, + "sampling/importance_sampling_ratio/min": 0.5566846132278442, + "sampling/sampling_logp_difference/max": 0.5857564210891724, + "sampling/sampling_logp_difference/mean": 0.01386752724647522, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 178.765625, + "completions/mean_terminated_length": 178.765625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3317575752735138, + "epoch": 1.7610294117647058, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8074945467788956, + "kl": 0.135996013879776, + "learning_rate": 4.3782814767625755e-07, + "loss": -0.0746, + "num_tokens": 45323482.0, + "reward": -0.375, + "reward_std": 0.6681214570999146, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8816889524459839, + "sampling/importance_sampling_ratio/mean": 1.0004677772521973, + "sampling/importance_sampling_ratio/min": 0.6718875169754028, + "sampling/sampling_logp_difference/max": 0.6321697235107422, + "sampling/sampling_logp_difference/mean": 0.016171332448720932, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 255.5, + "completions/mean_terminated_length": 255.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.32487937808036804, + "epoch": 1.7622549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3665938780035523, + "kl": 0.13701051473617554, + "learning_rate": 4.371213603597987e-07, + "loss": 0.0242, + "num_tokens": 45356570.0, + "reward": 0.40625, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000591278076172, + "sampling/importance_sampling_ratio/min": 0.6603941917419434, + "sampling/sampling_logp_difference/max": 0.7689223289489746, + "sampling/sampling_logp_difference/mean": 0.014964740723371506, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 224.234375, + "completions/mean_terminated_length": 224.234375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3308764696121216, + "epoch": 1.7634803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5102491486744842, + "kl": 0.1029498428106308, + "learning_rate": 4.3641470068299483e-07, + "loss": -0.0091, + "num_tokens": 45396089.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997514486312866, + "sampling/importance_sampling_ratio/min": 0.27851998805999756, + "sampling/sampling_logp_difference/max": 1.434424877166748, + "sampling/sampling_logp_difference/mean": 0.017084071412682533, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 240.765625, + "completions/mean_terminated_length": 240.765625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.330849289894104, + "epoch": 1.7647058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0874013950583177, + "kl": 0.10099180042743683, + "learning_rate": 4.3570817008032044e-07, + "loss": -0.0211, + "num_tokens": 45428874.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6062994003295898, + "sampling/importance_sampling_ratio/mean": 1.0001918077468872, + "sampling/importance_sampling_ratio/min": 0.25758519768714905, + "sampling/sampling_logp_difference/max": 1.3564047813415527, + "sampling/sampling_logp_difference/mean": 0.015466933138668537, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 141.34375, + "completions/mean_terminated_length": 141.34375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.21416430175304413, + "epoch": 1.7659313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5186219349172438, + "kl": 0.11881053447723389, + "learning_rate": 4.350017699859877e-07, + "loss": 0.0205, + "num_tokens": 45450800.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5538417100906372, + "sampling/importance_sampling_ratio/mean": 0.9992287158966064, + "sampling/importance_sampling_ratio/min": 0.6020976305007935, + "sampling/sampling_logp_difference/max": 0.5073356628417969, + "sampling/sampling_logp_difference/mean": 0.013720016926527023, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 232.84375, + "completions/mean_terminated_length": 232.84375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3054463863372803, + "epoch": 1.767156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3211377376294955, + "kl": 0.11160194873809814, + "learning_rate": 4.342955018339441e-07, + "loss": 0.0844, + "num_tokens": 45481910.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9497469663619995, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 0.5904914140701294, + "sampling/sampling_logp_difference/max": 0.6676995754241943, + "sampling/sampling_logp_difference/mean": 0.014185999520123005, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 266.09375, + "completions/mean_terminated_length": 266.09375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3187551200389862, + "epoch": 1.7683823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5081586476203768, + "kl": 0.11256249248981476, + "learning_rate": 4.335893670578694e-07, + "loss": 0.0353, + "num_tokens": 45520380.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996122717857361, + "sampling/importance_sampling_ratio/min": 0.46971988677978516, + "sampling/sampling_logp_difference/max": 0.7696003913879395, + "sampling/sampling_logp_difference/mean": 0.016480494290590286, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 166.671875, + "completions/mean_terminated_length": 166.671875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.21188709139823914, + "epoch": 1.7696078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11147503156738692, + "kl": 0.10149864852428436, + "learning_rate": 4.328833670911724e-07, + "loss": 0.001, + "num_tokens": 45546167.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7319319248199463, + "sampling/importance_sampling_ratio/mean": 0.999884307384491, + "sampling/importance_sampling_ratio/min": 0.6386851668357849, + "sampling/sampling_logp_difference/max": 0.5492374897003174, + "sampling/sampling_logp_difference/mean": 0.012810531072318554, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 185.21875, + "completions/mean_terminated_length": 185.21875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2641652226448059, + "epoch": 1.7708333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3653630721501335, + "kl": 0.10343880951404572, + "learning_rate": 4.3217750336698803e-07, + "loss": -0.0101, + "num_tokens": 45571989.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998292326927185, + "sampling/importance_sampling_ratio/min": 0.4542100131511688, + "sampling/sampling_logp_difference/max": 0.8377575874328613, + "sampling/sampling_logp_difference/mean": 0.014748496934771538, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 165.390625, + "completions/mean_terminated_length": 165.390625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.2973984181880951, + "epoch": 1.7720588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05266866935891322, + "kl": 0.11347492039203644, + "learning_rate": 4.314717773181752e-07, + "loss": 0.0011, + "num_tokens": 45601422.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5784883499145508, + "sampling/importance_sampling_ratio/mean": 0.9995951652526855, + "sampling/importance_sampling_ratio/min": 0.6271665096282959, + "sampling/sampling_logp_difference/max": 0.46654319763183594, + "sampling/sampling_logp_difference/mean": 0.015134399756789207, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 180.484375, + "completions/mean_terminated_length": 180.484375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.29683828353881836, + "epoch": 1.7732843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.341287814484405, + "kl": 0.11379870027303696, + "learning_rate": 4.3076619037731287e-07, + "loss": 0.0465, + "num_tokens": 45628669.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5927168130874634, + "sampling/importance_sampling_ratio/mean": 0.9999488592147827, + "sampling/importance_sampling_ratio/min": 0.6072784662246704, + "sampling/sampling_logp_difference/max": 0.4987678527832031, + "sampling/sampling_logp_difference/mean": 0.015673775225877762, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 287.046875, + "completions/mean_terminated_length": 287.046875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.3124946355819702, + "epoch": 1.7745098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2691119800818518, + "kl": 0.11259143054485321, + "learning_rate": 4.3006074397669836e-07, + "loss": 0.0939, + "num_tokens": 45667120.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001816749572754, + "sampling/importance_sampling_ratio/min": 0.3952656090259552, + "sampling/sampling_logp_difference/max": 0.9281973838806152, + "sampling/sampling_logp_difference/mean": 0.014476969838142395, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 244.390625, + "completions/mean_terminated_length": 244.390625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3101966977119446, + "epoch": 1.7757352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03819452015546175, + "kl": 0.08543956279754639, + "learning_rate": 4.293554395483425e-07, + "loss": 0.0008, + "num_tokens": 45705913.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6706359386444092, + "sampling/importance_sampling_ratio/mean": 1.0001120567321777, + "sampling/importance_sampling_ratio/min": 0.4687845706939697, + "sampling/sampling_logp_difference/max": 0.7576119899749756, + "sampling/sampling_logp_difference/mean": 0.016486328095197678, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 291.453125, + "completions/mean_terminated_length": 291.453125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.2985606789588928, + "epoch": 1.7769607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.915618789408616, + "kl": 0.08149291574954987, + "learning_rate": 4.2865027852396894e-07, + "loss": -0.0011, + "num_tokens": 45744630.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7755229473114014, + "sampling/importance_sampling_ratio/mean": 0.9995817542076111, + "sampling/importance_sampling_ratio/min": 0.5869686603546143, + "sampling/sampling_logp_difference/max": 0.5740950107574463, + "sampling/sampling_logp_difference/mean": 0.014272743836045265, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 165.359375, + "completions/mean_terminated_length": 165.359375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.26212406158447266, + "epoch": 1.778186274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.291255588472374, + "kl": 0.18543842434883118, + "learning_rate": 4.2794526233501004e-07, + "loss": 0.0256, + "num_tokens": 45769853.0, + "reward": 0.0625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004011392593384, + "sampling/importance_sampling_ratio/min": 0.5027245283126831, + "sampling/sampling_logp_difference/max": 0.7940542697906494, + "sampling/sampling_logp_difference/mean": 0.013519938103854656, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 214.671875, + "completions/mean_terminated_length": 214.671875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3533676862716675, + "epoch": 1.7794117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08780767237267713, + "kl": 0.13456638157367706, + "learning_rate": 4.272403924126035e-07, + "loss": 0.0013, + "num_tokens": 45801352.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5818028450012207, + "sampling/importance_sampling_ratio/mean": 1.00015389919281, + "sampling/importance_sampling_ratio/min": 0.6185346841812134, + "sampling/sampling_logp_difference/max": 0.48040199279785156, + "sampling/sampling_logp_difference/mean": 0.016733458265662193, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 215.515625, + "completions/mean_terminated_length": 215.515625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.31544774770736694, + "epoch": 1.780637254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.75756718326626, + "kl": 0.1051800325512886, + "learning_rate": 4.2653567018759103e-07, + "loss": 0.0239, + "num_tokens": 45836313.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994698166847229, + "sampling/importance_sampling_ratio/min": 0.5134934186935425, + "sampling/sampling_logp_difference/max": 1.0599148273468018, + "sampling/sampling_logp_difference/mean": 0.016565721482038498, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 286.5, + "completions/mean_terminated_length": 286.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.2523324489593506, + "epoch": 1.781862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9300216698323395, + "kl": 0.07508465647697449, + "learning_rate": 4.258310970905139e-07, + "loss": -0.0007, + "num_tokens": 45875193.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6011593341827393, + "sampling/importance_sampling_ratio/mean": 1.0002411603927612, + "sampling/importance_sampling_ratio/min": 0.6262943148612976, + "sampling/sampling_logp_difference/max": 0.47072792053222656, + "sampling/sampling_logp_difference/mean": 0.012446523644030094, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 239.578125, + "completions/mean_terminated_length": 239.578125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3384774923324585, + "epoch": 1.7830882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05992620510387595, + "kl": 0.09544166922569275, + "learning_rate": 4.251266745516112e-07, + "loss": 0.001, + "num_tokens": 45913806.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5321756601333618, + "sampling/importance_sampling_ratio/mean": 1.0001329183578491, + "sampling/importance_sampling_ratio/min": 0.5305793881416321, + "sampling/sampling_logp_difference/max": 0.6337857246398926, + "sampling/sampling_logp_difference/mean": 0.015669850632548332, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 182.453125, + "completions/mean_terminated_length": 182.453125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2476593255996704, + "epoch": 1.784313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1933820000353008, + "kl": 0.12715579569339752, + "learning_rate": 4.2442240400081556e-07, + "loss": -0.0053, + "num_tokens": 45944635.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.435200810432434, + "sampling/importance_sampling_ratio/mean": 1.0001413822174072, + "sampling/importance_sampling_ratio/min": 0.44347378611564636, + "sampling/sampling_logp_difference/max": 0.8131165504455566, + "sampling/sampling_logp_difference/mean": 0.013275878503918648, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 249.453125, + "completions/mean_terminated_length": 249.453125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.19521364569664001, + "epoch": 1.7855392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1393297829970839, + "kl": 0.07648970186710358, + "learning_rate": 4.2371828686775186e-07, + "loss": 0.0046, + "num_tokens": 45981368.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6007364988327026, + "sampling/importance_sampling_ratio/mean": 1.0003199577331543, + "sampling/importance_sampling_ratio/min": 0.5038032531738281, + "sampling/sampling_logp_difference/max": 0.6855695247650146, + "sampling/sampling_logp_difference/mean": 0.01115436665713787, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 200.34375, + "completions/mean_terminated_length": 200.34375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2711257338523865, + "epoch": 1.7867647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6282354524255949, + "kl": 0.1633697897195816, + "learning_rate": 4.2301432458173316e-07, + "loss": -0.0459, + "num_tokens": 46006734.0, + "reward": 0.0, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.730678677558899, + "sampling/importance_sampling_ratio/mean": 1.0005601644515991, + "sampling/importance_sampling_ratio/min": 0.5676622986793518, + "sampling/sampling_logp_difference/max": 0.5662285089492798, + "sampling/sampling_logp_difference/mean": 0.014953596517443657, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 176.375, + "completions/mean_terminated_length": 176.375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3045671582221985, + "epoch": 1.7879901960784315, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.36006326814763, + "kl": 0.205718532204628, + "learning_rate": 4.223105185717585e-07, + "loss": 0.0279, + "num_tokens": 46032982.0, + "reward": -0.125, + "reward_std": 0.6708203554153442, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6023335456848145, + "sampling/importance_sampling_ratio/mean": 0.999557375907898, + "sampling/importance_sampling_ratio/min": 0.6203153133392334, + "sampling/sampling_logp_difference/max": 0.477527379989624, + "sampling/sampling_logp_difference/mean": 0.015139946714043617, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 200.921875, + "completions/mean_terminated_length": 200.921875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.27441561222076416, + "epoch": 1.7892156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4234743173928497, + "kl": 0.10259873420000076, + "learning_rate": 4.216068702665093e-07, + "loss": 0.0328, + "num_tokens": 46063249.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.7136951684951782, + "sampling/importance_sampling_ratio/mean": 1.0002691745758057, + "sampling/importance_sampling_ratio/min": 0.4836650490760803, + "sampling/sampling_logp_difference/max": 0.7263627052307129, + "sampling/sampling_logp_difference/mean": 0.015271389856934547, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 232.171875, + "completions/mean_terminated_length": 232.171875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.23431970179080963, + "epoch": 1.7904411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8919099955040428, + "kl": 0.07990675419569016, + "learning_rate": 4.2090338109434703e-07, + "loss": -0.0099, + "num_tokens": 46099852.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5862653255462646, + "sampling/importance_sampling_ratio/mean": 0.9999886155128479, + "sampling/importance_sampling_ratio/min": 0.5498828291893005, + "sampling/sampling_logp_difference/max": 0.5980501174926758, + "sampling/sampling_logp_difference/mean": 0.014020893722772598, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 189.625, + "completions/mean_terminated_length": 189.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.1814536303281784, + "epoch": 1.7916666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04358499048492709, + "kl": 0.06934978812932968, + "learning_rate": 4.202000524833105e-07, + "loss": 0.0007, + "num_tokens": 46132708.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4989718198776245, + "sampling/importance_sampling_ratio/mean": 0.9997729063034058, + "sampling/importance_sampling_ratio/min": 0.5376615524291992, + "sampling/sampling_logp_difference/max": 0.6205259561538696, + "sampling/sampling_logp_difference/mean": 0.011484737507998943, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 218.359375, + "completions/mean_terminated_length": 218.359375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3579706847667694, + "epoch": 1.7928921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2533118260163254, + "kl": 0.14592593908309937, + "learning_rate": 4.194968858611117e-07, + "loss": 0.0016, + "num_tokens": 46165419.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.623975396156311, + "sampling/importance_sampling_ratio/mean": 0.9996335506439209, + "sampling/importance_sampling_ratio/min": 0.5625147223472595, + "sampling/sampling_logp_difference/max": 0.5753380060195923, + "sampling/sampling_logp_difference/mean": 0.016157599166035652, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 308.015625, + "completions/mean_terminated_length": 308.015625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.264693021774292, + "epoch": 1.7941176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9742286420006561, + "kl": 0.06580743193626404, + "learning_rate": 4.187938826551346e-07, + "loss": -0.0001, + "num_tokens": 46212268.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995682835578918, + "sampling/importance_sampling_ratio/min": 0.39027827978134155, + "sampling/sampling_logp_difference/max": 0.9408953189849854, + "sampling/sampling_logp_difference/mean": 0.01413442101329565, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 162.296875, + "completions/mean_terminated_length": 162.296875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.24900653958320618, + "epoch": 1.795343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7551959821952114, + "kl": 0.13092195987701416, + "learning_rate": 4.180910442924311e-07, + "loss": -0.0141, + "num_tokens": 46236735.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7444039583206177, + "sampling/importance_sampling_ratio/mean": 1.0002326965332031, + "sampling/importance_sampling_ratio/min": 0.6056519150733948, + "sampling/sampling_logp_difference/max": 0.556412935256958, + "sampling/sampling_logp_difference/mean": 0.013616163283586502, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 205.296875, + "completions/mean_terminated_length": 205.296875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.21178001165390015, + "epoch": 1.7965686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04014984455842662, + "kl": 0.07349078357219696, + "learning_rate": 4.173883721997188e-07, + "loss": 0.0007, + "num_tokens": 46272050.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9960434436798096, + "sampling/importance_sampling_ratio/mean": 1.0001407861709595, + "sampling/importance_sampling_ratio/min": 0.532551646232605, + "sampling/sampling_logp_difference/max": 0.691166877746582, + "sampling/sampling_logp_difference/mean": 0.012803494930267334, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 221.796875, + "completions/mean_terminated_length": 221.796875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.2358531504869461, + "epoch": 1.7977941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04191345470477028, + "kl": 0.06782007962465286, + "learning_rate": 4.1668586780337713e-07, + "loss": 0.0007, + "num_tokens": 46300501.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5745536088943481, + "sampling/importance_sampling_ratio/mean": 1.00028395652771, + "sampling/importance_sampling_ratio/min": 0.4825882315635681, + "sampling/sampling_logp_difference/max": 0.7285915613174438, + "sampling/sampling_logp_difference/mean": 0.013855335302650928, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 215.875, + "completions/mean_terminated_length": 215.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2522590458393097, + "epoch": 1.7990196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5970123198131199, + "kl": 0.13570408523082733, + "learning_rate": 4.159835325294457e-07, + "loss": -0.0156, + "num_tokens": 46327229.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.9506787061691284, + "sampling/importance_sampling_ratio/mean": 0.9993472099304199, + "sampling/importance_sampling_ratio/min": 0.5643655061721802, + "sampling/sampling_logp_difference/max": 0.6681773662567139, + "sampling/sampling_logp_difference/mean": 0.013971049338579178, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 160.65625, + "completions/mean_terminated_length": 160.65625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2732565402984619, + "epoch": 1.8002450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1604348819027174, + "kl": 0.1681511104106903, + "learning_rate": 4.152813678036208e-07, + "loss": 0.0015, + "num_tokens": 46356087.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995669722557068, + "sampling/importance_sampling_ratio/min": 0.5160248875617981, + "sampling/sampling_logp_difference/max": 1.227365493774414, + "sampling/sampling_logp_difference/mean": 0.01550702191889286, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 217.234375, + "completions/mean_terminated_length": 217.234375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3137251138687134, + "epoch": 1.8014705882352942, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.961620072047609, + "kl": 0.13955897092819214, + "learning_rate": 4.145793750512522e-07, + "loss": -0.0159, + "num_tokens": 46386198.0, + "reward": -0.3125, + "reward_std": 0.6285127401351929, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.488340973854065, + "sampling/importance_sampling_ratio/mean": 0.9991217255592346, + "sampling/importance_sampling_ratio/min": 0.4871235191822052, + "sampling/sampling_logp_difference/max": 0.7192375659942627, + "sampling/sampling_logp_difference/mean": 0.01649576798081398, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 193.5, + "completions/mean_terminated_length": 193.5, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.27667832374572754, + "epoch": 1.8026960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6599879293237676, + "kl": 0.1454058289527893, + "learning_rate": 4.1387755569734054e-07, + "loss": -0.002, + "num_tokens": 46417110.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000545620918274, + "sampling/importance_sampling_ratio/min": 0.4651028513908386, + "sampling/sampling_logp_difference/max": 0.9523067474365234, + "sampling/sampling_logp_difference/mean": 0.015224859118461609, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 202.8125, + "completions/mean_terminated_length": 202.8125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.29587581753730774, + "epoch": 1.803921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9993185187123691, + "kl": 0.1164996549487114, + "learning_rate": 4.131759111665348e-07, + "loss": 0.0112, + "num_tokens": 46451050.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5471752882003784, + "sampling/importance_sampling_ratio/mean": 1.0005613565444946, + "sampling/importance_sampling_ratio/min": 0.5380123257637024, + "sampling/sampling_logp_difference/max": 0.6198737621307373, + "sampling/sampling_logp_difference/mean": 0.016710273921489716, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 217.140625, + "completions/mean_terminated_length": 217.140625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.31315773725509644, + "epoch": 1.8051470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7123886413960567, + "kl": 0.1587759405374527, + "learning_rate": 4.1247444288312895e-07, + "loss": -0.0008, + "num_tokens": 46482403.0, + "reward": -0.34375, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996099472045898, + "sampling/importance_sampling_ratio/min": 0.3785988986492157, + "sampling/sampling_logp_difference/max": 2.326936721801758, + "sampling/sampling_logp_difference/mean": 0.017582248896360397, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 294.796875, + "completions/mean_terminated_length": 294.796875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.41033393144607544, + "epoch": 1.8063725490196079, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4855582911495697, + "kl": 0.13153117895126343, + "learning_rate": 4.1177315227105926e-07, + "loss": 0.0051, + "num_tokens": 46522150.0, + "reward": 0.28125, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.714239239692688, + "sampling/importance_sampling_ratio/mean": 0.9994593858718872, + "sampling/importance_sampling_ratio/min": 0.6069623231887817, + "sampling/sampling_logp_difference/max": 0.5389693975448608, + "sampling/sampling_logp_difference/mean": 0.018105637282133102, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 222.171875, + "completions/mean_terminated_length": 222.171875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2713545560836792, + "epoch": 1.8075980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09772615448301242, + "kl": 0.12450879067182541, + "learning_rate": 4.1107204075390096e-07, + "loss": 0.0012, + "num_tokens": 46549713.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7520533800125122, + "sampling/importance_sampling_ratio/mean": 1.0002514123916626, + "sampling/importance_sampling_ratio/min": 0.5910095572471619, + "sampling/sampling_logp_difference/max": 0.5607883930206299, + "sampling/sampling_logp_difference/mean": 0.014145957306027412, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 213.859375, + "completions/mean_terminated_length": 213.859375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.24778100848197937, + "epoch": 1.8088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3635704139500446, + "kl": 0.07690733671188354, + "learning_rate": 4.1037110975486617e-07, + "loss": 0.028, + "num_tokens": 46580568.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.83034086227417, + "sampling/importance_sampling_ratio/mean": 1.0001349449157715, + "sampling/importance_sampling_ratio/min": 0.5244014859199524, + "sampling/sampling_logp_difference/max": 0.6454976797103882, + "sampling/sampling_logp_difference/mean": 0.01386608649045229, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 270.734375, + "completions/mean_terminated_length": 270.734375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.2906627655029297, + "epoch": 1.8100490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.351512616836499, + "kl": 0.07863107323646545, + "learning_rate": 4.096703606968006e-07, + "loss": 0.063, + "num_tokens": 46615495.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6088883876800537, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 0.41614967584609985, + "sampling/sampling_logp_difference/max": 0.8767102956771851, + "sampling/sampling_logp_difference/mean": 0.013265259563922882, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 289.578125, + "completions/mean_terminated_length": 289.578125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.3211671710014343, + "epoch": 1.8112745098039216, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4997630477810504, + "kl": 0.10330028831958771, + "learning_rate": 4.0896979500218014e-07, + "loss": -0.0384, + "num_tokens": 46659532.0, + "reward": 0.125, + "reward_std": 0.6047805547714233, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998058080673218, + "sampling/importance_sampling_ratio/min": 0.5684040188789368, + "sampling/sampling_logp_difference/max": 0.7329325675964355, + "sampling/sampling_logp_difference/mean": 0.015124181285500526, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 249.078125, + "completions/mean_terminated_length": 249.078125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.23253154754638672, + "epoch": 1.8125, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.179699382372317, + "kl": 0.08571634441614151, + "learning_rate": 4.082694140931088e-07, + "loss": -0.0072, + "num_tokens": 46693073.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7971853017807007, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 0.46659648418426514, + "sampling/sampling_logp_difference/max": 0.7622904777526855, + "sampling/sampling_logp_difference/mean": 0.012605142779648304, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 182.84375, + "completions/mean_terminated_length": 182.84375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.27958163619041443, + "epoch": 1.8137254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05490407431691958, + "kl": 0.1917632669210434, + "learning_rate": 4.0756921939131563e-07, + "loss": 0.0017, + "num_tokens": 46721959.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8738871812820435, + "sampling/importance_sampling_ratio/mean": 1.0003198385238647, + "sampling/importance_sampling_ratio/min": 0.6499074101448059, + "sampling/sampling_logp_difference/max": 0.6280150413513184, + "sampling/sampling_logp_difference/mean": 0.014630744233727455, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 258.546875, + "completions/mean_terminated_length": 258.546875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3771745562553406, + "epoch": 1.8149509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05803421229739693, + "kl": 0.16859950125217438, + "learning_rate": 4.0686921231815155e-07, + "loss": 0.0016, + "num_tokens": 46756570.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6955606937408447, + "sampling/importance_sampling_ratio/mean": 0.9999762773513794, + "sampling/importance_sampling_ratio/min": 0.5583487153053284, + "sampling/sampling_logp_difference/max": 0.5827715396881104, + "sampling/sampling_logp_difference/mean": 0.017337318509817123, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 198.78125, + "completions/mean_terminated_length": 198.78125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.2628799378871918, + "epoch": 1.8161764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.04604941449462, + "kl": 0.15804724395275116, + "learning_rate": 4.0616939429458627e-07, + "loss": 0.0333, + "num_tokens": 46783628.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5277986526489258, + "sampling/importance_sampling_ratio/mean": 0.9998676776885986, + "sampling/importance_sampling_ratio/min": 0.6298384666442871, + "sampling/sampling_logp_difference/max": 0.462291955947876, + "sampling/sampling_logp_difference/mean": 0.014284651726484299, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 194.515625, + "completions/mean_terminated_length": 194.515625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.2097713053226471, + "epoch": 1.8174019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05423994249844911, + "kl": 0.08659006655216217, + "learning_rate": 4.0546976674120623e-07, + "loss": 0.0009, + "num_tokens": 46814925.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.62943696975708, + "sampling/importance_sampling_ratio/mean": 1.0001730918884277, + "sampling/importance_sampling_ratio/min": 0.6562926173210144, + "sampling/sampling_logp_difference/max": 0.4882345199584961, + "sampling/sampling_logp_difference/mean": 0.011546341702342033, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 252.90625, + "completions/mean_terminated_length": 252.90625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.35610705614089966, + "epoch": 1.8186274509803921, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.628361087929497, + "kl": 0.1090826690196991, + "learning_rate": 4.047703310782111e-07, + "loss": -0.0379, + "num_tokens": 46854375.0, + "reward": 0.28125, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003490447998047, + "sampling/importance_sampling_ratio/min": 0.4153086543083191, + "sampling/sampling_logp_difference/max": 0.8787332773208618, + "sampling/sampling_logp_difference/mean": 0.01803845912218094, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 233.453125, + "completions/mean_terminated_length": 233.453125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.2663761377334595, + "epoch": 1.8198529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9176760582059719, + "kl": 0.09884761273860931, + "learning_rate": 4.0407108872541105e-07, + "loss": -0.009, + "num_tokens": 46891204.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6473641395568848, + "sampling/importance_sampling_ratio/mean": 1.0004463195800781, + "sampling/importance_sampling_ratio/min": 0.6516190767288208, + "sampling/sampling_logp_difference/max": 0.4991765022277832, + "sampling/sampling_logp_difference/mean": 0.01306439470499754, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.28954237699508667, + "epoch": 1.821078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8364595052336887, + "kl": 0.1455627828836441, + "learning_rate": 4.0337204110222347e-07, + "loss": 0.0807, + "num_tokens": 46923334.0, + "reward": 0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9398034811019897, + "sampling/importance_sampling_ratio/mean": 0.9994158744812012, + "sampling/importance_sampling_ratio/min": 0.5024063587188721, + "sampling/sampling_logp_difference/max": 0.6883460283279419, + "sampling/sampling_logp_difference/mean": 0.015754956752061844, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 202.53125, + "completions/mean_terminated_length": 202.53125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.23330318927764893, + "epoch": 1.8223039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2342846455729133, + "kl": 0.12264470756053925, + "learning_rate": 4.0267318962767076e-07, + "loss": 0.0136, + "num_tokens": 46953992.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991086721420288, + "sampling/importance_sampling_ratio/min": 0.5747461915016174, + "sampling/sampling_logp_difference/max": 0.7172539234161377, + "sampling/sampling_logp_difference/mean": 0.013632211834192276, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 178.40625, + "completions/mean_terminated_length": 178.40625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2289418876171112, + "epoch": 1.8235294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.841502353638724, + "kl": 0.18628448247909546, + "learning_rate": 4.0197453572037747e-07, + "loss": 0.0018, + "num_tokens": 46983538.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001065731048584, + "sampling/importance_sampling_ratio/min": 0.6073968410491943, + "sampling/sampling_logp_difference/max": 0.754854679107666, + "sampling/sampling_logp_difference/mean": 0.013173737563192844, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 205.859375, + "completions/mean_terminated_length": 205.859375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.30589017271995544, + "epoch": 1.8247549019607843, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.243124549402031, + "kl": 0.12316029518842697, + "learning_rate": 4.0127608079856644e-07, + "loss": -0.1134, + "num_tokens": 47011913.0, + "reward": -0.4375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.7125898599624634, + "sampling/importance_sampling_ratio/mean": 0.9999831914901733, + "sampling/importance_sampling_ratio/min": 0.4982426166534424, + "sampling/sampling_logp_difference/max": 0.6966681480407715, + "sampling/sampling_logp_difference/mean": 0.014620396308600903, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 169.359375, + "completions/mean_terminated_length": 169.359375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.25947123765945435, + "epoch": 1.8259803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4159056667900753, + "kl": 0.12732058763504028, + "learning_rate": 4.005778262800571e-07, + "loss": -0.0308, + "num_tokens": 47042592.0, + "reward": -0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004578828811646, + "sampling/importance_sampling_ratio/min": 0.3742866814136505, + "sampling/sampling_logp_difference/max": 0.9827332496643066, + "sampling/sampling_logp_difference/mean": 0.015355970710515976, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 175.75, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2130059003829956, + "epoch": 1.8272058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8738946641892246, + "kl": 0.07923636585474014, + "learning_rate": 3.9987977358226175e-07, + "loss": 0.0033, + "num_tokens": 47074416.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6526607275009155, + "sampling/importance_sampling_ratio/mean": 1.0003128051757812, + "sampling/importance_sampling_ratio/min": 0.38447070121765137, + "sampling/sampling_logp_difference/max": 0.9558877944946289, + "sampling/sampling_logp_difference/mean": 0.01331046037375927, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 289.15625, + "completions/mean_terminated_length": 289.15625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.36798161268234253, + "epoch": 1.8284313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9583959489737331, + "kl": 0.09859618544578552, + "learning_rate": 3.991819241221835e-07, + "loss": 0.0271, + "num_tokens": 47125066.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995404481887817, + "sampling/importance_sampling_ratio/min": 0.48147302865982056, + "sampling/sampling_logp_difference/max": 0.8621160984039307, + "sampling/sampling_logp_difference/mean": 0.018618982285261154, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 228.578125, + "completions/mean_terminated_length": 228.578125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3434247672557831, + "epoch": 1.829656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.261896640877614, + "kl": 0.1256704330444336, + "learning_rate": 3.98484279316412e-07, + "loss": 0.0092, + "num_tokens": 47161151.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.7861305475234985, + "sampling/importance_sampling_ratio/mean": 0.9994518756866455, + "sampling/importance_sampling_ratio/min": 0.42127373814582825, + "sampling/sampling_logp_difference/max": 0.8644723892211914, + "sampling/sampling_logp_difference/mean": 0.01720285415649414, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 198.203125, + "completions/mean_terminated_length": 198.203125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.19486339390277863, + "epoch": 1.8308823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0524757162358044, + "kl": 0.07670879364013672, + "learning_rate": 3.977868405811223e-07, + "loss": 0.0007, + "num_tokens": 47189228.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9442415237426758, + "sampling/importance_sampling_ratio/mean": 1.0005048513412476, + "sampling/importance_sampling_ratio/min": 0.4719997048377991, + "sampling/sampling_logp_difference/max": 0.7507768869400024, + "sampling/sampling_logp_difference/mean": 0.012243038043379784, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 217.0, + "completions/mean_terminated_length": 217.0, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2507721185684204, + "epoch": 1.8321078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04301824966306972, + "kl": 0.07585926353931427, + "learning_rate": 3.970896093320708e-07, + "loss": 0.0008, + "num_tokens": 47220700.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6032006740570068, + "sampling/importance_sampling_ratio/mean": 1.0000191926956177, + "sampling/importance_sampling_ratio/min": 0.33986422419548035, + "sampling/sampling_logp_difference/max": 1.0792090892791748, + "sampling/sampling_logp_difference/mean": 0.014406761154532433, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 205.75, + "completions/mean_terminated_length": 205.75, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.33627083897590637, + "epoch": 1.8333333333333335, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0551102354014685, + "kl": 0.12954556941986084, + "learning_rate": 3.9639258698459287e-07, + "loss": 0.0234, + "num_tokens": 47251628.0, + "reward": 0.65625, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.601887822151184, + "sampling/importance_sampling_ratio/mean": 1.0008529424667358, + "sampling/importance_sampling_ratio/min": 0.6152583360671997, + "sampling/sampling_logp_difference/max": 0.48571306467056274, + "sampling/sampling_logp_difference/mean": 0.016873590648174286, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 264.375, + "completions/mean_terminated_length": 264.375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.22614750266075134, + "epoch": 1.8345588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2858808512563478, + "kl": 0.0691496804356575, + "learning_rate": 3.9569577495359964e-07, + "loss": -0.0193, + "num_tokens": 47287236.0, + "reward": 0.625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.8390065431594849, + "sampling/importance_sampling_ratio/mean": 0.9997743368148804, + "sampling/importance_sampling_ratio/min": 0.4419277310371399, + "sampling/sampling_logp_difference/max": 0.8166089057922363, + "sampling/sampling_logp_difference/mean": 0.011157220229506493, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 205.5625, + "completions/mean_terminated_length": 205.5625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3292863070964813, + "epoch": 1.8357843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4682332015823245, + "kl": 0.1530056744813919, + "learning_rate": 3.949991746535753e-07, + "loss": -0.0221, + "num_tokens": 47317560.0, + "reward": 0.34375, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5506867170333862, + "sampling/importance_sampling_ratio/mean": 0.999796450138092, + "sampling/importance_sampling_ratio/min": 0.5594758987426758, + "sampling/sampling_logp_difference/max": 0.5807547569274902, + "sampling/sampling_logp_difference/mean": 0.01646328717470169, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.19123998284339905, + "epoch": 1.8370098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05356769945636859, + "kl": 0.08881576359272003, + "learning_rate": 3.943027874985746e-07, + "loss": 0.0008, + "num_tokens": 47349600.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9949634075164795, + "sampling/importance_sampling_ratio/mean": 1.0006195306777954, + "sampling/importance_sampling_ratio/min": 0.5720028281211853, + "sampling/sampling_logp_difference/max": 0.6906256675720215, + "sampling/sampling_logp_difference/mean": 0.012663639150559902, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 198.140625, + "completions/mean_terminated_length": 198.140625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.22961711883544922, + "epoch": 1.8382352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09255699113039914, + "kl": 0.08062359690666199, + "learning_rate": 3.9360661490221904e-07, + "loss": 0.0008, + "num_tokens": 47388361.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9732640981674194, + "sampling/importance_sampling_ratio/mean": 0.9990033507347107, + "sampling/importance_sampling_ratio/min": 0.46037212014198303, + "sampling/sampling_logp_difference/max": 0.7757201194763184, + "sampling/sampling_logp_difference/mean": 0.014116080477833748, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 213.375, + "completions/mean_terminated_length": 213.375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2590044438838959, + "epoch": 1.8394607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4592039551549316, + "kl": 0.1066092699766159, + "learning_rate": 3.929106582776948e-07, + "loss": 0.0136, + "num_tokens": 47419361.0, + "reward": 0.0, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6243494749069214, + "sampling/importance_sampling_ratio/mean": 0.9995856881141663, + "sampling/importance_sampling_ratio/min": 0.48131364583969116, + "sampling/sampling_logp_difference/max": 0.7312362194061279, + "sampling/sampling_logp_difference/mean": 0.014168147929012775, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 171.4375, + "completions/mean_terminated_length": 171.4375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3639901876449585, + "epoch": 1.840686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.304178011378884, + "kl": 0.1234322190284729, + "learning_rate": 3.9221491903775013e-07, + "loss": 0.0157, + "num_tokens": 47452429.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.475376844406128, + "sampling/importance_sampling_ratio/mean": 1.0000149011611938, + "sampling/importance_sampling_ratio/min": 0.613954484462738, + "sampling/sampling_logp_difference/max": 0.48783445358276367, + "sampling/sampling_logp_difference/mean": 0.01782539114356041, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 187.171875, + "completions/mean_terminated_length": 187.171875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2886807918548584, + "epoch": 1.8419117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9699376969092584, + "kl": 0.10959329456090927, + "learning_rate": 3.9151939859469166e-07, + "loss": -0.0429, + "num_tokens": 47483240.0, + "reward": 0.1875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.7225044965744019, + "sampling/importance_sampling_ratio/mean": 0.9997414350509644, + "sampling/importance_sampling_ratio/min": 0.4769878685474396, + "sampling/sampling_logp_difference/max": 0.7402641773223877, + "sampling/sampling_logp_difference/mean": 0.01478142011910677, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 166.25, + "completions/mean_terminated_length": 166.25, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2390981912612915, + "epoch": 1.843137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5250096918124199, + "kl": 0.108737051486969, + "learning_rate": 3.908240983603813e-07, + "loss": 0.0073, + "num_tokens": 47512808.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.552701711654663, + "sampling/importance_sampling_ratio/mean": 0.999780535697937, + "sampling/importance_sampling_ratio/min": 0.6056217551231384, + "sampling/sampling_logp_difference/max": 0.5014996528625488, + "sampling/sampling_logp_difference/mean": 0.012281188741326332, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 193.375, + "completions/mean_terminated_length": 193.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.21787826716899872, + "epoch": 1.844362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3875391936463561, + "kl": 0.09802134335041046, + "learning_rate": 3.9012901974623476e-07, + "loss": 0.0085, + "num_tokens": 47540016.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.8179765939712524, + "sampling/importance_sampling_ratio/mean": 0.9997023344039917, + "sampling/importance_sampling_ratio/min": 0.5782334208488464, + "sampling/sampling_logp_difference/max": 0.5977240800857544, + "sampling/sampling_logp_difference/mean": 0.013284020125865936, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 245.71875, + "completions/mean_terminated_length": 245.71875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.28870612382888794, + "epoch": 1.8455882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.22860420168087, + "kl": 0.10918144136667252, + "learning_rate": 3.894341641632176e-07, + "loss": -0.0241, + "num_tokens": 47579214.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.6286340951919556, + "sampling/importance_sampling_ratio/mean": 0.9998392462730408, + "sampling/importance_sampling_ratio/min": 0.6145747303962708, + "sampling/sampling_logp_difference/max": 0.48774170875549316, + "sampling/sampling_logp_difference/mean": 0.014949234202504158, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 227.46875, + "completions/mean_terminated_length": 227.46875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2842986583709717, + "epoch": 1.846813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0996649217388619, + "kl": 0.11265279352664948, + "learning_rate": 3.8873953302184283e-07, + "loss": -0.004, + "num_tokens": 47613676.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6251676082611084, + "sampling/importance_sampling_ratio/mean": 1.0002528429031372, + "sampling/importance_sampling_ratio/min": 0.6164861917495728, + "sampling/sampling_logp_difference/max": 0.4856109619140625, + "sampling/sampling_logp_difference/mean": 0.014792348258197308, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 226.34375, + "completions/mean_terminated_length": 226.34375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2616327404975891, + "epoch": 1.8480392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2053015883865053, + "kl": 0.09128358960151672, + "learning_rate": 3.880451277321673e-07, + "loss": -0.0056, + "num_tokens": 47645954.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6281298398971558, + "sampling/importance_sampling_ratio/mean": 0.9998895525932312, + "sampling/importance_sampling_ratio/min": 0.6181973814964294, + "sampling/sampling_logp_difference/max": 0.48743200302124023, + "sampling/sampling_logp_difference/mean": 0.014848420396447182, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 138.84375, + "completions/mean_terminated_length": 138.84375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.16786305606365204, + "epoch": 1.8492647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06030377569260939, + "kl": 0.08539316058158875, + "learning_rate": 3.873509497037899e-07, + "loss": 0.0009, + "num_tokens": 47674200.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.55501389503479, + "sampling/importance_sampling_ratio/mean": 0.9998781085014343, + "sampling/importance_sampling_ratio/min": 0.6359153389930725, + "sampling/sampling_logp_difference/max": 0.45268988609313965, + "sampling/sampling_logp_difference/mean": 0.011962493881583214, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 195.25, + "completions/mean_terminated_length": 195.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2259249985218048, + "epoch": 1.8504901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.189394489035492, + "kl": 0.08550411462783813, + "learning_rate": 3.8665700034584834e-07, + "loss": -0.008, + "num_tokens": 47706040.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001052618026733, + "sampling/importance_sampling_ratio/min": 0.6622968316078186, + "sampling/sampling_logp_difference/max": 0.8150925636291504, + "sampling/sampling_logp_difference/mean": 0.013018874451518059, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 151.9375, + "completions/mean_terminated_length": 151.9375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.21084097027778625, + "epoch": 1.8517156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1387918498637539, + "kl": 0.10547801852226257, + "learning_rate": 3.8596328106701533e-07, + "loss": 0.0011, + "num_tokens": 47729732.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6024487018585205, + "sampling/importance_sampling_ratio/mean": 1.0001769065856934, + "sampling/importance_sampling_ratio/min": 0.6019724607467651, + "sampling/sampling_logp_difference/max": 0.5075435638427734, + "sampling/sampling_logp_difference/mean": 0.013056870549917221, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 183.203125, + "completions/mean_terminated_length": 183.203125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3111172914505005, + "epoch": 1.8529411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7726465743314133, + "kl": 0.12318593263626099, + "learning_rate": 3.8526979327549736e-07, + "loss": 0.0036, + "num_tokens": 47764209.0, + "reward": 0.59375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5539941787719727, + "sampling/importance_sampling_ratio/mean": 1.0003302097320557, + "sampling/importance_sampling_ratio/min": 0.4000977575778961, + "sampling/sampling_logp_difference/max": 0.9160463809967041, + "sampling/sampling_logp_difference/mean": 0.01625010371208191, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 164.34375, + "completions/mean_terminated_length": 164.34375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.2743145823478699, + "epoch": 1.8541666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051260549816440265, + "kl": 0.1191352903842926, + "learning_rate": 3.845765383790306e-07, + "loss": 0.0012, + "num_tokens": 47790615.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5348610877990723, + "sampling/importance_sampling_ratio/mean": 1.0002870559692383, + "sampling/importance_sampling_ratio/min": 0.6147403120994568, + "sampling/sampling_logp_difference/max": 0.4865553379058838, + "sampling/sampling_logp_difference/mean": 0.01568988896906376, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 164.21875, + "completions/mean_terminated_length": 164.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3106948137283325, + "epoch": 1.8553921568627452, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8744626546891598, + "kl": 0.20338189601898193, + "learning_rate": 3.8388351778487875e-07, + "loss": 0.0002, + "num_tokens": 47820357.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.9546488523483276, + "sampling/importance_sampling_ratio/mean": 1.000866174697876, + "sampling/importance_sampling_ratio/min": 0.17713141441345215, + "sampling/sampling_logp_difference/max": 1.730863332748413, + "sampling/sampling_logp_difference/mean": 0.015201722271740437, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 164.75, + "completions/mean_terminated_length": 164.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.26740169525146484, + "epoch": 1.8566176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.556234841472086, + "kl": 0.13963264226913452, + "learning_rate": 3.831907328998295e-07, + "loss": 0.0026, + "num_tokens": 47850469.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994199275970459, + "sampling/importance_sampling_ratio/min": 0.4526996910572052, + "sampling/sampling_logp_difference/max": 0.9892764091491699, + "sampling/sampling_logp_difference/mean": 0.015211023390293121, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 150.96875, + "completions/mean_terminated_length": 150.96875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.21206092834472656, + "epoch": 1.857843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5210169519150867, + "kl": 0.08909106254577637, + "learning_rate": 3.824981851301924e-07, + "loss": -0.0137, + "num_tokens": 47877555.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.3761188983917236, + "sampling/importance_sampling_ratio/mean": 1.0001142024993896, + "sampling/importance_sampling_ratio/min": 0.500291645526886, + "sampling/sampling_logp_difference/max": 0.6925640106201172, + "sampling/sampling_logp_difference/mean": 0.012899190187454224, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 162.59375, + "completions/mean_terminated_length": 162.59375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2940405309200287, + "epoch": 1.8590686274509802, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9957742558492082, + "kl": 0.17864391207695007, + "learning_rate": 3.818058758817955e-07, + "loss": 0.0082, + "num_tokens": 47908361.0, + "reward": 0.28125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6625418663024902, + "sampling/importance_sampling_ratio/mean": 1.0001530647277832, + "sampling/importance_sampling_ratio/min": 0.5949472188949585, + "sampling/sampling_logp_difference/max": 0.5192825794219971, + "sampling/sampling_logp_difference/mean": 0.014901855029165745, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 160.71875, + "completions/mean_terminated_length": 160.71875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2383217215538025, + "epoch": 1.8602941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053719037874208904, + "kl": 0.09652819484472275, + "learning_rate": 3.81113806559983e-07, + "loss": 0.001, + "num_tokens": 47934455.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5130321979522705, + "sampling/importance_sampling_ratio/mean": 1.00014328956604, + "sampling/importance_sampling_ratio/min": 0.5200313329696655, + "sampling/sampling_logp_difference/max": 0.6538662910461426, + "sampling/sampling_logp_difference/mean": 0.01406307052820921, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 150.875, + "completions/mean_terminated_length": 150.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2827237546443939, + "epoch": 1.8615196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3917265181523395, + "kl": 0.19590789079666138, + "learning_rate": 3.804219785696113e-07, + "loss": -0.0125, + "num_tokens": 47958239.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.628027319908142, + "sampling/importance_sampling_ratio/mean": 1.0000962018966675, + "sampling/importance_sampling_ratio/min": 0.6129540801048279, + "sampling/sampling_logp_difference/max": 0.48946523666381836, + "sampling/sampling_logp_difference/mean": 0.015526263043284416, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 174.9375, + "completions/mean_terminated_length": 174.9375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.19941727817058563, + "epoch": 1.8627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2961982281604962, + "kl": 0.09546373784542084, + "learning_rate": 3.797303933150475e-07, + "loss": 0.0076, + "num_tokens": 47983739.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5509536266326904, + "sampling/importance_sampling_ratio/mean": 1.0001888275146484, + "sampling/importance_sampling_ratio/min": 0.5733482837677002, + "sampling/sampling_logp_difference/max": 0.5562620162963867, + "sampling/sampling_logp_difference/mean": 0.012389476411044598, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 227.15625, + "completions/mean_terminated_length": 227.15625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.23176491260528564, + "epoch": 1.8639705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9506818974268831, + "kl": 0.08148758858442307, + "learning_rate": 3.790390522001662e-07, + "loss": -0.0287, + "num_tokens": 48020021.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8623803853988647, + "sampling/importance_sampling_ratio/mean": 1.0003161430358887, + "sampling/importance_sampling_ratio/min": 0.4322664439678192, + "sampling/sampling_logp_difference/max": 0.8387131690979004, + "sampling/sampling_logp_difference/mean": 0.013027187436819077, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 202.25, + "completions/mean_terminated_length": 202.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.28213566541671753, + "epoch": 1.8651960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6442198217386463, + "kl": 0.14837434887886047, + "learning_rate": 3.7834795662834566e-07, + "loss": 0.0212, + "num_tokens": 48050085.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6636914014816284, + "sampling/importance_sampling_ratio/mean": 1.000145673751831, + "sampling/importance_sampling_ratio/min": 0.6549220085144043, + "sampling/sampling_logp_difference/max": 0.5090389251708984, + "sampling/sampling_logp_difference/mean": 0.013476305641233921, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 243.46875, + "completions/mean_terminated_length": 243.46875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3794707655906677, + "epoch": 1.866421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4424293410595619, + "kl": 0.1346874237060547, + "learning_rate": 3.776571080024663e-07, + "loss": 0.0186, + "num_tokens": 48089443.0, + "reward": 0.0625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4868202209472656, + "sampling/importance_sampling_ratio/mean": 1.0007059574127197, + "sampling/importance_sampling_ratio/min": 0.47760945558547974, + "sampling/sampling_logp_difference/max": 0.738961935043335, + "sampling/sampling_logp_difference/mean": 0.017995767295360565, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 181.921875, + "completions/mean_terminated_length": 181.921875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3146703243255615, + "epoch": 1.8676470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.175896482969458, + "kl": 0.10516366362571716, + "learning_rate": 3.76966507724907e-07, + "loss": 0.0082, + "num_tokens": 48125550.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009829998016357, + "sampling/importance_sampling_ratio/min": 0.12568892538547516, + "sampling/sampling_logp_difference/max": 2.2150397300720215, + "sampling/sampling_logp_difference/mean": 0.017225069925189018, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 234.796875, + "completions/mean_terminated_length": 234.796875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.34670189023017883, + "epoch": 1.8688725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5481841591682168, + "kl": 0.08109765499830246, + "learning_rate": 3.762761571975429e-07, + "loss": 0.0103, + "num_tokens": 48162801.0, + "reward": -0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6302809715270996, + "sampling/importance_sampling_ratio/mean": 0.9997133016586304, + "sampling/importance_sampling_ratio/min": 0.5842203497886658, + "sampling/sampling_logp_difference/max": 0.5374770164489746, + "sampling/sampling_logp_difference/mean": 0.017425820231437683, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 205.21875, + "completions/mean_terminated_length": 205.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3151162266731262, + "epoch": 1.8700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3857105972380215, + "kl": 0.11170510202646255, + "learning_rate": 3.755860578217413e-07, + "loss": -0.0017, + "num_tokens": 48195327.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6599576473236084, + "sampling/importance_sampling_ratio/mean": 0.99996018409729, + "sampling/importance_sampling_ratio/min": 0.4196990132331848, + "sampling/sampling_logp_difference/max": 0.8682174682617188, + "sampling/sampling_logp_difference/mean": 0.01472011860460043, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 181.171875, + "completions/mean_terminated_length": 181.171875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2721059322357178, + "epoch": 1.8713235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3365972261907422, + "kl": 0.12668752670288086, + "learning_rate": 3.7489621099836043e-07, + "loss": -0.0189, + "num_tokens": 48223498.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6299506425857544, + "sampling/importance_sampling_ratio/mean": 0.9992806911468506, + "sampling/importance_sampling_ratio/min": 0.37794965505599976, + "sampling/sampling_logp_difference/max": 0.972994327545166, + "sampling/sampling_logp_difference/mean": 0.014614992775022984, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 184.21875, + "completions/mean_terminated_length": 184.21875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.27154216170310974, + "epoch": 1.8725490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1836721743103784, + "kl": 0.120089091360569, + "learning_rate": 3.742066181277457e-07, + "loss": 0.0066, + "num_tokens": 48256392.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 1.0003831386566162, + "sampling/importance_sampling_ratio/min": 0.495414674282074, + "sampling/sampling_logp_difference/max": 0.7023601531982422, + "sampling/sampling_logp_difference/mean": 0.014493845403194427, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 225.78125, + "completions/mean_terminated_length": 225.78125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2942451536655426, + "epoch": 1.8737745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3398500817617414, + "kl": 0.09755627810955048, + "learning_rate": 3.735172806097271e-07, + "loss": -0.0237, + "num_tokens": 48291258.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.603361964225769, + "sampling/importance_sampling_ratio/mean": 1.0005296468734741, + "sampling/importance_sampling_ratio/min": 0.5283520817756653, + "sampling/sampling_logp_difference/max": 0.6379923820495605, + "sampling/sampling_logp_difference/mean": 0.015455886721611023, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 189.703125, + "completions/mean_terminated_length": 189.703125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.21763397753238678, + "epoch": 1.875, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2386752291814265, + "kl": 0.11837249249219894, + "learning_rate": 3.7282819984361577e-07, + "loss": 0.0011, + "num_tokens": 48321255.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6312732696533203, + "sampling/importance_sampling_ratio/mean": 0.9998139142990112, + "sampling/importance_sampling_ratio/min": 0.4972875118255615, + "sampling/sampling_logp_difference/max": 0.6985869407653809, + "sampling/sampling_logp_difference/mean": 0.012105113826692104, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 195.8125, + "completions/mean_terminated_length": 195.8125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.26901644468307495, + "epoch": 1.8762254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2460732346461223, + "kl": 0.10338087379932404, + "learning_rate": 3.721393772282022e-07, + "loss": 0.0037, + "num_tokens": 48349707.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.463740348815918, + "sampling/importance_sampling_ratio/mean": 0.9998382329940796, + "sampling/importance_sampling_ratio/min": 0.4661833643913269, + "sampling/sampling_logp_difference/max": 0.7631762027740479, + "sampling/sampling_logp_difference/mean": 0.013834136538207531, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 188.515625, + "completions/mean_terminated_length": 188.515625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2946900725364685, + "epoch": 1.8774509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.913459932908527, + "kl": 0.15940743684768677, + "learning_rate": 3.7145081416175264e-07, + "loss": 0.0169, + "num_tokens": 48379468.0, + "reward": 0.625, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.7782058715820312, + "sampling/importance_sampling_ratio/mean": 0.999894380569458, + "sampling/importance_sampling_ratio/min": 0.6610523462295532, + "sampling/sampling_logp_difference/max": 0.5756049156188965, + "sampling/sampling_logp_difference/mean": 0.014666395261883736, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 199.671875, + "completions/mean_terminated_length": 199.671875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.31731051206588745, + "epoch": 1.8786764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9024912622749615, + "kl": 0.14467984437942505, + "learning_rate": 3.7076251204200667e-07, + "loss": -0.1033, + "num_tokens": 48410983.0, + "reward": 0.21875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000901222229004, + "sampling/importance_sampling_ratio/min": 0.6036661267280579, + "sampling/sampling_logp_difference/max": 1.132408618927002, + "sampling/sampling_logp_difference/mean": 0.01580166071653366, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 170.671875, + "completions/mean_terminated_length": 170.671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2762795686721802, + "epoch": 1.8799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6546919602424224, + "kl": 0.11146007478237152, + "learning_rate": 3.700744722661736e-07, + "loss": 0.0011, + "num_tokens": 48435954.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4841010570526123, + "sampling/importance_sampling_ratio/mean": 0.9997158646583557, + "sampling/importance_sampling_ratio/min": 0.6257337927818298, + "sampling/sampling_logp_difference/max": 0.4688303470611572, + "sampling/sampling_logp_difference/mean": 0.014529142528772354, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 146.40625, + "completions/mean_terminated_length": 146.40625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.26064950227737427, + "epoch": 1.8811274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08540739013399869, + "kl": 0.11784351617097855, + "learning_rate": 3.693866962309308e-07, + "loss": 0.0012, + "num_tokens": 48465788.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6977688074111938, + "sampling/importance_sampling_ratio/mean": 1.0002050399780273, + "sampling/importance_sampling_ratio/min": 0.6122998595237732, + "sampling/sampling_logp_difference/max": 0.5293148756027222, + "sampling/sampling_logp_difference/mean": 0.014467135071754456, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 187.703125, + "completions/mean_terminated_length": 187.703125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.34977987408638, + "epoch": 1.8823529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0112208247999641, + "kl": 0.1438819169998169, + "learning_rate": 3.686991853324202e-07, + "loss": 0.0137, + "num_tokens": 48497017.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 0.99979168176651, + "sampling/importance_sampling_ratio/min": 0.6265778541564941, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.017088035121560097, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 191.1875, + "completions/mean_terminated_length": 191.1875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2127748429775238, + "epoch": 1.883578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05688519141386496, + "kl": 0.07666490226984024, + "learning_rate": 3.680119409662451e-07, + "loss": 0.0007, + "num_tokens": 48526053.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8571991920471191, + "sampling/importance_sampling_ratio/mean": 0.9998555183410645, + "sampling/importance_sampling_ratio/min": 0.4798246920108795, + "sampling/sampling_logp_difference/max": 0.7343344688415527, + "sampling/sampling_logp_difference/mean": 0.012846048921346664, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 202.640625, + "completions/mean_terminated_length": 202.640625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3353343605995178, + "epoch": 1.8848039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.813218533983896, + "kl": 0.1376620978116989, + "learning_rate": 3.673249645274682e-07, + "loss": -0.0391, + "num_tokens": 48557358.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.8516814708709717, + "sampling/importance_sampling_ratio/mean": 1.0001487731933594, + "sampling/importance_sampling_ratio/min": 0.47982609272003174, + "sampling/sampling_logp_difference/max": 0.7343316078186035, + "sampling/sampling_logp_difference/mean": 0.016058053821325302, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 166.265625, + "completions/mean_terminated_length": 166.265625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.27629363536834717, + "epoch": 1.8860294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04776955300620745, + "kl": 0.09619492292404175, + "learning_rate": 3.6663825741060805e-07, + "loss": 0.001, + "num_tokens": 48588079.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997478723526001, + "sampling/importance_sampling_ratio/min": 0.3985402286052704, + "sampling/sampling_logp_difference/max": 0.9277105331420898, + "sampling/sampling_logp_difference/mean": 0.017165351659059525, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 174.671875, + "completions/mean_terminated_length": 174.671875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.29174095392227173, + "epoch": 1.8872549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5602647270473926, + "kl": 0.11156705021858215, + "learning_rate": 3.6595182100963686e-07, + "loss": 0.0035, + "num_tokens": 48614938.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6307547092437744, + "sampling/importance_sampling_ratio/mean": 0.9993473887443542, + "sampling/importance_sampling_ratio/min": 0.5263680815696716, + "sampling/sampling_logp_difference/max": 0.6417546272277832, + "sampling/sampling_logp_difference/mean": 0.014674471691250801, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 203.859375, + "completions/mean_terminated_length": 203.859375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.28613030910491943, + "epoch": 1.8884803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5430968543347237, + "kl": 0.16828909516334534, + "learning_rate": 3.652656567179765e-07, + "loss": 0.0035, + "num_tokens": 48643457.0, + "reward": 0.15625, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.9935235977172852, + "sampling/importance_sampling_ratio/mean": 0.999808669090271, + "sampling/importance_sampling_ratio/min": 0.6217340230941772, + "sampling/sampling_logp_difference/max": 0.689903736114502, + "sampling/sampling_logp_difference/mean": 0.01587986946105957, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 182.09375, + "completions/mean_terminated_length": 182.09375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.30671757459640503, + "epoch": 1.8897058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6157744644914547, + "kl": 0.15455128252506256, + "learning_rate": 3.645797659284975e-07, + "loss": 0.0167, + "num_tokens": 48669799.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6490719318389893, + "sampling/importance_sampling_ratio/mean": 1.0000280141830444, + "sampling/importance_sampling_ratio/min": 0.6067567467689514, + "sampling/sampling_logp_difference/max": 0.5002126693725586, + "sampling/sampling_logp_difference/mean": 0.01575349271297455, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 203.21875, + "completions/mean_terminated_length": 203.21875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3348330855369568, + "epoch": 1.8909313725490198, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5496038380993364, + "kl": 0.1350516676902771, + "learning_rate": 3.638941500335144e-07, + "loss": 0.0338, + "num_tokens": 48700517.0, + "reward": -0.125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.96829354763031, + "sampling/importance_sampling_ratio/mean": 0.9999657273292542, + "sampling/importance_sampling_ratio/min": 0.605185866355896, + "sampling/sampling_logp_difference/max": 0.6771669387817383, + "sampling/sampling_logp_difference/mean": 0.01601865142583847, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 237.171875, + "completions/mean_terminated_length": 237.171875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.35526013374328613, + "epoch": 1.892156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.340869176986576, + "kl": 0.1703725904226303, + "learning_rate": 3.6320881042478433e-07, + "loss": -0.0082, + "num_tokens": 48731824.0, + "reward": 0.84375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7339519262313843, + "sampling/importance_sampling_ratio/mean": 0.999858021736145, + "sampling/importance_sampling_ratio/min": 0.35683271288871765, + "sampling/sampling_logp_difference/max": 1.0304882526397705, + "sampling/sampling_logp_difference/mean": 0.017715346068143845, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 150.1875, + "completions/mean_terminated_length": 150.1875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.219431072473526, + "epoch": 1.8933823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6490211239168067, + "kl": 0.1438380926847458, + "learning_rate": 3.6252374849350303e-07, + "loss": -0.0216, + "num_tokens": 48759916.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.85312819480896, + "sampling/importance_sampling_ratio/mean": 0.9996017217636108, + "sampling/importance_sampling_ratio/min": 0.629998505115509, + "sampling/sampling_logp_difference/max": 0.616875171661377, + "sampling/sampling_logp_difference/mean": 0.013308055698871613, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 249.546875, + "completions/mean_terminated_length": 249.546875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.22123563289642334, + "epoch": 1.8946078431372548, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.516266467309043, + "kl": 0.08375102281570435, + "learning_rate": 3.618389656303029e-07, + "loss": -0.0845, + "num_tokens": 48794687.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8146148920059204, + "sampling/importance_sampling_ratio/mean": 0.999570369720459, + "sampling/importance_sampling_ratio/min": 0.2815396189689636, + "sampling/sampling_logp_difference/max": 1.267482042312622, + "sampling/sampling_logp_difference/mean": 0.012413117103278637, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 202.84375, + "completions/mean_terminated_length": 202.84375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.278066486120224, + "epoch": 1.8958333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.857750462868285, + "kl": 0.09571290016174316, + "learning_rate": 3.6115446322525e-07, + "loss": -0.083, + "num_tokens": 48829205.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986664056777954, + "sampling/importance_sampling_ratio/min": 0.20802132785320282, + "sampling/sampling_logp_difference/max": 1.5701146125793457, + "sampling/sampling_logp_difference/mean": 0.01835343800485134, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 216.421875, + "completions/mean_terminated_length": 216.421875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3158535957336426, + "epoch": 1.8970588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3937074063685995, + "kl": 0.11325500160455704, + "learning_rate": 3.6047024266784035e-07, + "loss": -0.0164, + "num_tokens": 48872864.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6009771823883057, + "sampling/importance_sampling_ratio/mean": 1.0001379251480103, + "sampling/importance_sampling_ratio/min": 0.44403478503227234, + "sampling/sampling_logp_difference/max": 0.8118524551391602, + "sampling/sampling_logp_difference/mean": 0.01701243221759796, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 246.6875, + "completions/mean_terminated_length": 246.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.33020949363708496, + "epoch": 1.8982843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3909020629085493, + "kl": 0.11101092398166656, + "learning_rate": 3.5978630534699865e-07, + "loss": -0.032, + "num_tokens": 48909500.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.8210878372192383, + "sampling/importance_sampling_ratio/mean": 1.0001590251922607, + "sampling/importance_sampling_ratio/min": 0.5914841890335083, + "sampling/sampling_logp_difference/max": 0.5994340181350708, + "sampling/sampling_logp_difference/mean": 0.015389536507427692, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 221.359375, + "completions/mean_terminated_length": 221.359375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.29521629214286804, + "epoch": 1.8995098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7727821848863863, + "kl": 0.08533985912799835, + "learning_rate": 3.591026526510742e-07, + "loss": -0.0259, + "num_tokens": 48944819.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000815510749817, + "sampling/importance_sampling_ratio/min": 0.5918084979057312, + "sampling/sampling_logp_difference/max": 0.7039074897766113, + "sampling/sampling_logp_difference/mean": 0.014909489080309868, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 234.25, + "completions/mean_terminated_length": 234.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.31491348147392273, + "epoch": 1.9007352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3101563688698865, + "kl": 0.09991556406021118, + "learning_rate": 3.584192859678391e-07, + "loss": 0.0041, + "num_tokens": 48977059.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8134992122650146, + "sampling/importance_sampling_ratio/mean": 0.9999299049377441, + "sampling/importance_sampling_ratio/min": 0.4001094698905945, + "sampling/sampling_logp_difference/max": 0.9160170555114746, + "sampling/sampling_logp_difference/mean": 0.015464898198843002, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 227.96875, + "completions/mean_terminated_length": 227.96875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.371176540851593, + "epoch": 1.9019607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.650144214127249, + "kl": 0.12711381912231445, + "learning_rate": 3.577362066844838e-07, + "loss": 0.0399, + "num_tokens": 49009761.0, + "reward": 0.3125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6179018020629883, + "sampling/importance_sampling_ratio/mean": 0.9994966387748718, + "sampling/importance_sampling_ratio/min": 0.5008446574211121, + "sampling/sampling_logp_difference/max": 0.6914592981338501, + "sampling/sampling_logp_difference/mean": 0.017303530126810074, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 183.859375, + "completions/mean_terminated_length": 183.859375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.25452548265457153, + "epoch": 1.903186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3072688658521017, + "kl": 0.12985754013061523, + "learning_rate": 3.570534161876163e-07, + "loss": 0.021, + "num_tokens": 49036280.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999993622303009, + "sampling/importance_sampling_ratio/min": 0.4193432331085205, + "sampling/sampling_logp_difference/max": 0.8690655827522278, + "sampling/sampling_logp_difference/mean": 0.014714433811604977, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 204.515625, + "completions/mean_terminated_length": 204.515625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2913321554660797, + "epoch": 1.9044117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3504290648689279, + "kl": 0.08884111791849136, + "learning_rate": 3.5637091586325796e-07, + "loss": -0.0062, + "num_tokens": 49072265.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002952814102173, + "sampling/importance_sampling_ratio/min": 0.4834056496620178, + "sampling/sampling_logp_difference/max": 0.931877613067627, + "sampling/sampling_logp_difference/mean": 0.01549257431179285, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 201.984375, + "completions/mean_terminated_length": 201.984375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.2502061128616333, + "epoch": 1.905637254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6647828098078714, + "kl": 0.10091829299926758, + "learning_rate": 3.556887070968414e-07, + "loss": 0.0131, + "num_tokens": 49103464.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007450580596924, + "sampling/importance_sampling_ratio/min": 0.48473289608955383, + "sampling/sampling_logp_difference/max": 0.7275445461273193, + "sampling/sampling_logp_difference/mean": 0.014591528102755547, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 211.03125, + "completions/mean_terminated_length": 211.03125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.30164915323257446, + "epoch": 1.906862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9772018158037367, + "kl": 0.09650573134422302, + "learning_rate": 3.550067912732069e-07, + "loss": 0.021, + "num_tokens": 49137610.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 1.0000648498535156, + "sampling/importance_sampling_ratio/min": 0.6071924567222595, + "sampling/sampling_logp_difference/max": 0.49890947341918945, + "sampling/sampling_logp_difference/mean": 0.015118611045181751, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 224.546875, + "completions/mean_terminated_length": 224.546875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3440561890602112, + "epoch": 1.9080882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6866728597437486, + "kl": 0.10658356547355652, + "learning_rate": 3.5432516977660054e-07, + "loss": 0.0174, + "num_tokens": 49169485.0, + "reward": 0.3125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993762373924255, + "sampling/importance_sampling_ratio/min": 0.6254568696022034, + "sampling/sampling_logp_difference/max": 0.7467336654663086, + "sampling/sampling_logp_difference/mean": 0.016454674303531647, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 197.78125, + "completions/mean_terminated_length": 197.78125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.20059680938720703, + "epoch": 1.909313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04640111395933607, + "kl": 0.09063167124986649, + "learning_rate": 3.5364384399067094e-07, + "loss": 0.0009, + "num_tokens": 49199887.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993793964385986, + "sampling/importance_sampling_ratio/min": 0.41134488582611084, + "sampling/sampling_logp_difference/max": 0.8883233070373535, + "sampling/sampling_logp_difference/mean": 0.012949159368872643, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 179.515625, + "completions/mean_terminated_length": 179.515625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3543946444988251, + "epoch": 1.9105392156862746, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.382435530039299, + "kl": 0.12880370020866394, + "learning_rate": 3.5296281529846593e-07, + "loss": 0.0287, + "num_tokens": 49241648.0, + "reward": 0.125, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.8798727989196777, + "sampling/importance_sampling_ratio/mean": 1.0006365776062012, + "sampling/importance_sampling_ratio/min": 0.42330774664878845, + "sampling/sampling_logp_difference/max": 0.8596558570861816, + "sampling/sampling_logp_difference/mean": 0.019419439136981964, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 176.78125, + "completions/mean_terminated_length": 176.78125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.23964226245880127, + "epoch": 1.9117647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3608993259996585, + "kl": 0.11298610270023346, + "learning_rate": 3.5228208508243073e-07, + "loss": 0.0092, + "num_tokens": 49267362.0, + "reward": -0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6379655599594116, + "sampling/importance_sampling_ratio/mean": 1.0000442266464233, + "sampling/importance_sampling_ratio/min": 0.6133005619049072, + "sampling/sampling_logp_difference/max": 0.4934549331665039, + "sampling/sampling_logp_difference/mean": 0.013549616560339928, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 203.671875, + "completions/mean_terminated_length": 203.671875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.23592735826969147, + "epoch": 1.9129901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1011763669808319, + "kl": 0.09119807183742523, + "learning_rate": 3.5160165472440467e-07, + "loss": 0.0155, + "num_tokens": 49298365.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4173837900161743, + "sampling/importance_sampling_ratio/mean": 0.9997701048851013, + "sampling/importance_sampling_ratio/min": 0.3053518831729889, + "sampling/sampling_logp_difference/max": 1.1862905025482178, + "sampling/sampling_logp_difference/mean": 0.013066626153886318, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 194.796875, + "completions/mean_terminated_length": 194.796875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.31049007177352905, + "epoch": 1.9142156862745097, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.568139086966258, + "kl": 0.19013534486293793, + "learning_rate": 3.509215256056183e-07, + "loss": -0.0101, + "num_tokens": 49328656.0, + "reward": 0.46875, + "reward_std": 0.625, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5504263639450073, + "sampling/importance_sampling_ratio/mean": 0.9998918771743774, + "sampling/importance_sampling_ratio/min": 0.27835866808891296, + "sampling/sampling_logp_difference/max": 1.2788448333740234, + "sampling/sampling_logp_difference/mean": 0.0173039548099041, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 195.8125, + "completions/mean_terminated_length": 195.8125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2837347984313965, + "epoch": 1.9154411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7436632255652411, + "kl": 0.09441693127155304, + "learning_rate": 3.502416991066904e-07, + "loss": 0.0184, + "num_tokens": 49357812.0, + "reward": 0.4375, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5426450967788696, + "sampling/importance_sampling_ratio/mean": 0.9997560977935791, + "sampling/importance_sampling_ratio/min": 0.6066765785217285, + "sampling/sampling_logp_difference/max": 0.4997594356536865, + "sampling/sampling_logp_difference/mean": 0.01340518333017826, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 211.078125, + "completions/mean_terminated_length": 211.078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.27355197072029114, + "epoch": 1.9166666666666665, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.777329766937861, + "kl": 0.07297226041555405, + "learning_rate": 3.495621766076259e-07, + "loss": 0.0355, + "num_tokens": 49389689.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.635582685470581, + "sampling/importance_sampling_ratio/mean": 1.000390887260437, + "sampling/importance_sampling_ratio/min": 0.6435860395431519, + "sampling/sampling_logp_difference/max": 0.49199914932250977, + "sampling/sampling_logp_difference/mean": 0.014909939840435982, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 185.359375, + "completions/mean_terminated_length": 185.359375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.23646235466003418, + "epoch": 1.9178921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05513252817736153, + "kl": 0.1208547055721283, + "learning_rate": 3.488829594878123e-07, + "loss": 0.0011, + "num_tokens": 49418960.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.620741367340088, + "sampling/importance_sampling_ratio/mean": 0.9998612403869629, + "sampling/importance_sampling_ratio/min": 0.5412285923957825, + "sampling/sampling_logp_difference/max": 0.6139135360717773, + "sampling/sampling_logp_difference/mean": 0.014054106548428535, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 173.453125, + "completions/mean_terminated_length": 173.453125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.28601402044296265, + "epoch": 1.9191176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04753205139430537, + "kl": 0.09612636268138885, + "learning_rate": 3.4820404912601757e-07, + "loss": 0.001, + "num_tokens": 49451613.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.793167233467102, + "sampling/importance_sampling_ratio/mean": 1.0006403923034668, + "sampling/importance_sampling_ratio/min": 0.5947459936141968, + "sampling/sampling_logp_difference/max": 0.5839834213256836, + "sampling/sampling_logp_difference/mean": 0.01622890681028366, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 183.71875, + "completions/mean_terminated_length": 183.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.20884793996810913, + "epoch": 1.920343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07671820774126588, + "kl": 0.07233886420726776, + "learning_rate": 3.4752544690038643e-07, + "loss": 0.0007, + "num_tokens": 49479867.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000246524810791, + "sampling/importance_sampling_ratio/min": 0.21938800811767578, + "sampling/sampling_logp_difference/max": 1.5169134140014648, + "sampling/sampling_logp_difference/mean": 0.012974189594388008, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 220.4375, + "completions/mean_terminated_length": 220.4375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3239063620567322, + "epoch": 1.9215686274509802, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6904427287074242, + "kl": 0.150712251663208, + "learning_rate": 3.468471541884385e-07, + "loss": 0.077, + "num_tokens": 49508743.0, + "reward": 0.25, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996858835220337, + "sampling/importance_sampling_ratio/min": 0.6147395372390747, + "sampling/sampling_logp_difference/max": 0.711000919342041, + "sampling/sampling_logp_difference/mean": 0.016206160187721252, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 224.703125, + "completions/mean_terminated_length": 224.703125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2642619013786316, + "epoch": 1.9227941176470589, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8621671333958125, + "kl": 0.0999397486448288, + "learning_rate": 3.461691723670651e-07, + "loss": 0.0122, + "num_tokens": 49539140.0, + "reward": 0.09375, + "reward_std": 0.565913200378418, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6767631769180298, + "sampling/importance_sampling_ratio/mean": 0.9997922778129578, + "sampling/importance_sampling_ratio/min": 0.4298660457134247, + "sampling/sampling_logp_difference/max": 0.8442816734313965, + "sampling/sampling_logp_difference/mean": 0.014188846573233604, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 153.328125, + "completions/mean_terminated_length": 153.328125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.20939096808433533, + "epoch": 1.9240196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07113598995777741, + "kl": 0.08482904732227325, + "learning_rate": 3.454915028125263e-07, + "loss": 0.0008, + "num_tokens": 49566601.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8483213186264038, + "sampling/importance_sampling_ratio/mean": 1.0005934238433838, + "sampling/importance_sampling_ratio/min": 0.536852240562439, + "sampling/sampling_logp_difference/max": 0.6220324039459229, + "sampling/sampling_logp_difference/mean": 0.013298461213707924, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 173.796875, + "completions/mean_terminated_length": 173.796875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.29373040795326233, + "epoch": 1.9252450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6116127291643694, + "kl": 0.10015298426151276, + "learning_rate": 3.4481414690044836e-07, + "loss": 0.0044, + "num_tokens": 49598204.0, + "reward": -0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6278108358383179, + "sampling/importance_sampling_ratio/mean": 1.0005826950073242, + "sampling/importance_sampling_ratio/min": 0.6176396012306213, + "sampling/sampling_logp_difference/max": 0.48723602294921875, + "sampling/sampling_logp_difference/mean": 0.015931256115436554, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 197.53125, + "completions/mean_terminated_length": 197.53125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.210597962141037, + "epoch": 1.9264705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.712648102169922, + "kl": 0.08676162362098694, + "learning_rate": 3.441371060058209e-07, + "loss": 0.0443, + "num_tokens": 49629326.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0011491775512695, + "sampling/importance_sampling_ratio/min": 0.6279329061508179, + "sampling/sampling_logp_difference/max": 0.8303616046905518, + "sampling/sampling_logp_difference/mean": 0.012944528833031654, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 149.671875, + "completions/mean_terminated_length": 149.671875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.253029465675354, + "epoch": 1.9276960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0507027200014654, + "kl": 0.08234013617038727, + "learning_rate": 3.4346038150299425e-07, + "loss": 0.0008, + "num_tokens": 49651897.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6307547092437744, + "sampling/importance_sampling_ratio/mean": 0.999323844909668, + "sampling/importance_sampling_ratio/min": 0.5186219811439514, + "sampling/sampling_logp_difference/max": 0.6565799713134766, + "sampling/sampling_logp_difference/mean": 0.016448473557829857, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 190.0625, + "completions/mean_terminated_length": 190.0625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2652842104434967, + "epoch": 1.928921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6480919424848328, + "kl": 0.10174965858459473, + "learning_rate": 3.427839747656758e-07, + "loss": 0.0169, + "num_tokens": 49683677.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.605762243270874, + "sampling/importance_sampling_ratio/mean": 0.9999074935913086, + "sampling/importance_sampling_ratio/min": 0.48236799240112305, + "sampling/sampling_logp_difference/max": 0.729047954082489, + "sampling/sampling_logp_difference/mean": 0.014856329187750816, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 164.5, + "completions/mean_terminated_length": 164.5, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24237948656082153, + "epoch": 1.9301470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.055333795911757, + "kl": 0.10452765226364136, + "learning_rate": 3.4210788716692875e-07, + "loss": 0.0259, + "num_tokens": 49710045.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5084569454193115, + "sampling/importance_sampling_ratio/mean": 1.0006563663482666, + "sampling/importance_sampling_ratio/min": 0.6150839924812317, + "sampling/sampling_logp_difference/max": 0.4859964847564697, + "sampling/sampling_logp_difference/mean": 0.012377345934510231, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 175.703125, + "completions/mean_terminated_length": 175.703125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.35965946316719055, + "epoch": 1.9313725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6300033230824567, + "kl": 0.15588462352752686, + "learning_rate": 3.414321200791679e-07, + "loss": -0.0031, + "num_tokens": 49745786.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5989322662353516, + "sampling/importance_sampling_ratio/mean": 1.000009536743164, + "sampling/importance_sampling_ratio/min": 0.5671254396438599, + "sampling/sampling_logp_difference/max": 0.5671747922897339, + "sampling/sampling_logp_difference/mean": 0.01786952093243599, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 194.65625, + "completions/mean_terminated_length": 194.65625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3209313750267029, + "epoch": 1.9325980392156863, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3238252262505257, + "kl": 0.1693599373102188, + "learning_rate": 3.4075667487415785e-07, + "loss": -0.0046, + "num_tokens": 49779076.0, + "reward": 0.59375, + "reward_std": 0.659286618232727, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5972504615783691, + "sampling/importance_sampling_ratio/mean": 1.0004091262817383, + "sampling/importance_sampling_ratio/min": 0.6081857681274414, + "sampling/sampling_logp_difference/max": 0.49727487564086914, + "sampling/sampling_logp_difference/mean": 0.016534799709916115, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 190.53125, + "completions/mean_terminated_length": 190.53125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.35380131006240845, + "epoch": 1.9338235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15518261084076634, + "kl": 0.12272585928440094, + "learning_rate": 3.4008155292300934e-07, + "loss": 0.0013, + "num_tokens": 49806102.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5486302375793457, + "sampling/importance_sampling_ratio/mean": 0.9994552731513977, + "sampling/importance_sampling_ratio/min": 0.6216931343078613, + "sampling/sampling_logp_difference/max": 0.4753086566925049, + "sampling/sampling_logp_difference/mean": 0.017209038138389587, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 195.265625, + "completions/mean_terminated_length": 195.265625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2244930863380432, + "epoch": 1.9350490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9183236785939335, + "kl": 0.08692613244056702, + "learning_rate": 3.3940675559617723e-07, + "loss": -0.0734, + "num_tokens": 49840839.0, + "reward": -0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6320115327835083, + "sampling/importance_sampling_ratio/mean": 1.000977635383606, + "sampling/importance_sampling_ratio/min": 0.5341154932975769, + "sampling/sampling_logp_difference/max": 0.627143144607544, + "sampling/sampling_logp_difference/mean": 0.014392497949302197, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 211.53125, + "completions/mean_terminated_length": 211.53125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3096480369567871, + "epoch": 1.9362745098039216, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7361189983841854, + "kl": 0.15013748407363892, + "learning_rate": 3.3873228426345757e-07, + "loss": 0.0301, + "num_tokens": 49867225.0, + "reward": -0.0625, + "reward_std": 0.8220869898796082, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6300376653671265, + "sampling/importance_sampling_ratio/mean": 1.0004205703735352, + "sampling/importance_sampling_ratio/min": 0.587834358215332, + "sampling/sampling_logp_difference/max": 0.5313100814819336, + "sampling/sampling_logp_difference/mean": 0.015822414308786392, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 185.03125, + "completions/mean_terminated_length": 185.03125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2975696921348572, + "epoch": 1.9375, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2834407010139355, + "kl": 0.1522301435470581, + "learning_rate": 3.380581402939841e-07, + "loss": 0.0249, + "num_tokens": 49892683.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.85191011428833, + "sampling/importance_sampling_ratio/mean": 0.9997633099555969, + "sampling/importance_sampling_ratio/min": 0.4871048033237457, + "sampling/sampling_logp_difference/max": 0.719275951385498, + "sampling/sampling_logp_difference/mean": 0.015590054914355278, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 217.609375, + "completions/mean_terminated_length": 217.609375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2924651503562927, + "epoch": 1.9387254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5024869556632245, + "kl": 0.09442883729934692, + "learning_rate": 3.373843250562265e-07, + "loss": -0.0221, + "num_tokens": 49927778.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995254874229431, + "sampling/importance_sampling_ratio/min": 0.4735029935836792, + "sampling/sampling_logp_difference/max": 0.814455509185791, + "sampling/sampling_logp_difference/mean": 0.01673107221722603, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 203.90625, + "completions/mean_terminated_length": 203.90625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.23768167197704315, + "epoch": 1.9399509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.112366008579369, + "kl": 0.14769932627677917, + "learning_rate": 3.3671083991798697e-07, + "loss": -0.0012, + "num_tokens": 49956188.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6751999855041504, + "sampling/importance_sampling_ratio/mean": 0.9998273849487305, + "sampling/importance_sampling_ratio/min": 0.6054726243019104, + "sampling/sampling_logp_difference/max": 0.515932559967041, + "sampling/sampling_logp_difference/mean": 0.01382248941808939, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 186.5, + "completions/mean_terminated_length": 186.5, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.21476219594478607, + "epoch": 1.9411764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04603193232824236, + "kl": 0.07713761180639267, + "learning_rate": 3.360376862463978e-07, + "loss": 0.0008, + "num_tokens": 49982236.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001596212387085, + "sampling/importance_sampling_ratio/min": 0.5286228656768799, + "sampling/sampling_logp_difference/max": 0.7049002647399902, + "sampling/sampling_logp_difference/mean": 0.01282772608101368, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 205.71875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.27220970392227173, + "epoch": 1.9424019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7500272508696701, + "kl": 0.08735904097557068, + "learning_rate": 3.3536486540791823e-07, + "loss": -0.0007, + "num_tokens": 50011098.0, + "reward": 0.3125, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5705015659332275, + "sampling/importance_sampling_ratio/mean": 0.9996979236602783, + "sampling/importance_sampling_ratio/min": 0.6129493117332458, + "sampling/sampling_logp_difference/max": 0.4894731044769287, + "sampling/sampling_logp_difference/mean": 0.013286584988236427, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 181.640625, + "completions/mean_terminated_length": 181.640625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2341674566268921, + "epoch": 1.9436274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3454292313143326, + "kl": 0.08026061952114105, + "learning_rate": 3.3469237876833187e-07, + "loss": -0.0013, + "num_tokens": 50044403.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002331733703613, + "sampling/importance_sampling_ratio/min": 0.21306173503398895, + "sampling/sampling_logp_difference/max": 1.546173334121704, + "sampling/sampling_logp_difference/mean": 0.013525542803108692, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 207.4375, + "completions/mean_terminated_length": 207.4375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.23868905007839203, + "epoch": 1.9448529411764706, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3329696932606168, + "kl": 0.08665573596954346, + "learning_rate": 3.340202276927442e-07, + "loss": 0.1692, + "num_tokens": 50077727.0, + "reward": 0.8125, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5756419897079468, + "sampling/importance_sampling_ratio/mean": 0.9999165534973145, + "sampling/importance_sampling_ratio/min": 0.4954255223274231, + "sampling/sampling_logp_difference/max": 0.7023382186889648, + "sampling/sampling_logp_difference/mean": 0.014149514958262444, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 276.25, + "completions/mean_terminated_length": 276.25, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.32216912508010864, + "epoch": 1.946078431372549, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6251851960529853, + "kl": 0.10423454642295837, + "learning_rate": 3.333484135455792e-07, + "loss": 0.0141, + "num_tokens": 50115519.0, + "reward": -0.40625, + "reward_std": 0.5959457159042358, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5839189291000366, + "sampling/importance_sampling_ratio/mean": 1.0003249645233154, + "sampling/importance_sampling_ratio/min": 0.5713785290718079, + "sampling/sampling_logp_difference/max": 0.5597033500671387, + "sampling/sampling_logp_difference/mean": 0.01547469012439251, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 189.578125, + "completions/mean_terminated_length": 189.578125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3143174648284912, + "epoch": 1.9473039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.081170655936288, + "kl": 0.10240406543016434, + "learning_rate": 3.326769376905769e-07, + "loss": 0.0319, + "num_tokens": 50161412.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9267932176589966, + "sampling/importance_sampling_ratio/mean": 1.0005072355270386, + "sampling/importance_sampling_ratio/min": 0.5947045087814331, + "sampling/sampling_logp_difference/max": 0.6558570861816406, + "sampling/sampling_logp_difference/mean": 0.01755983754992485, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 255.65625, + "completions/mean_terminated_length": 255.65625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3196178674697876, + "epoch": 1.9485294117647058, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.792804478170327, + "kl": 0.09776392579078674, + "learning_rate": 3.3200580149079083e-07, + "loss": 0.0003, + "num_tokens": 50201534.0, + "reward": 0.34375, + "reward_std": 0.606805682182312, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.954484462738037, + "sampling/importance_sampling_ratio/mean": 0.9998239278793335, + "sampling/importance_sampling_ratio/min": 0.5243139863014221, + "sampling/sampling_logp_difference/max": 0.6701264381408691, + "sampling/sampling_logp_difference/mean": 0.016514722257852554, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 209.828125, + "completions/mean_terminated_length": 209.828125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.36181557178497314, + "epoch": 1.9497549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.619087280242418, + "kl": 0.11058124899864197, + "learning_rate": 3.31335006308585e-07, + "loss": 0.0166, + "num_tokens": 50230595.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6574147939682007, + "sampling/importance_sampling_ratio/mean": 1.0001424551010132, + "sampling/importance_sampling_ratio/min": 0.6203011274337769, + "sampling/sampling_logp_difference/max": 0.5052590370178223, + "sampling/sampling_logp_difference/mean": 0.017097918316721916, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 178.5625, + "completions/mean_terminated_length": 178.5625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.2967524826526642, + "epoch": 1.9509803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9463706558816324, + "kl": 0.1831379532814026, + "learning_rate": 3.3066455350563115e-07, + "loss": 0.0281, + "num_tokens": 50257351.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.638442039489746, + "sampling/importance_sampling_ratio/mean": 0.9994776844978333, + "sampling/importance_sampling_ratio/min": 0.44796988368034363, + "sampling/sampling_logp_difference/max": 0.8030292987823486, + "sampling/sampling_logp_difference/mean": 0.015726547688245773, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 166.796875, + "completions/mean_terminated_length": 166.796875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3216024339199066, + "epoch": 1.9522058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9469362686105434, + "kl": 0.12963071465492249, + "learning_rate": 3.29994444442906e-07, + "loss": 0.0012, + "num_tokens": 50285946.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.621140718460083, + "sampling/importance_sampling_ratio/mean": 1.000431776046753, + "sampling/importance_sampling_ratio/min": 0.6146537065505981, + "sampling/sampling_logp_difference/max": 0.4866962432861328, + "sampling/sampling_logp_difference/mean": 0.01588505692780018, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 195.0, + "completions/mean_terminated_length": 195.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.28989577293395996, + "epoch": 1.9534313725490198, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8948272853844492, + "kl": 0.13485756516456604, + "learning_rate": 3.2932468048068836e-07, + "loss": -0.0265, + "num_tokens": 50317930.0, + "reward": 0.1875, + "reward_std": 0.6531128883361816, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005075931549072, + "sampling/importance_sampling_ratio/min": 0.3008410334587097, + "sampling/sampling_logp_difference/max": 1.2011733055114746, + "sampling/sampling_logp_difference/mean": 0.016114819794893265, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 220.984375, + "completions/mean_terminated_length": 220.984375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2438029646873474, + "epoch": 1.954656862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6301769553324, + "kl": 0.10240459442138672, + "learning_rate": 3.2865526297855694e-07, + "loss": 0.0118, + "num_tokens": 50353417.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7520787715911865, + "sampling/importance_sampling_ratio/mean": 1.0001596212387085, + "sampling/importance_sampling_ratio/min": 0.6250287294387817, + "sampling/sampling_logp_difference/max": 0.5608029365539551, + "sampling/sampling_logp_difference/mean": 0.012727048248052597, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 140.109375, + "completions/mean_terminated_length": 140.109375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24194109439849854, + "epoch": 1.9558823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.382676593859849, + "kl": 0.11661958694458008, + "learning_rate": 3.2798619329538646e-07, + "loss": 0.0123, + "num_tokens": 50378544.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6962480545043945, + "sampling/importance_sampling_ratio/mean": 1.0009865760803223, + "sampling/importance_sampling_ratio/min": 0.5991002321243286, + "sampling/sampling_logp_difference/max": 0.528418779373169, + "sampling/sampling_logp_difference/mean": 0.014339487068355083, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 198.90625, + "completions/mean_terminated_length": 198.90625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3294591009616852, + "epoch": 1.9571078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.537652087376244, + "kl": 0.12539511919021606, + "learning_rate": 3.2731747278934623e-07, + "loss": 0.0314, + "num_tokens": 50411658.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994417428970337, + "sampling/importance_sampling_ratio/min": 0.4973803460597992, + "sampling/sampling_logp_difference/max": 1.0194251537322998, + "sampling/sampling_logp_difference/mean": 0.017773278057575226, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 229.078125, + "completions/mean_terminated_length": 229.078125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.24577781558036804, + "epoch": 1.9583333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3640381789741722, + "kl": 0.07902765274047852, + "learning_rate": 3.266491028178964e-07, + "loss": 0.0208, + "num_tokens": 50445311.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.7745686769485474, + "sampling/importance_sampling_ratio/mean": 1.000251054763794, + "sampling/importance_sampling_ratio/min": 0.3827075958251953, + "sampling/sampling_logp_difference/max": 0.9604840278625488, + "sampling/sampling_logp_difference/mean": 0.012837364338338375, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 200.015625, + "completions/mean_terminated_length": 200.015625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.20478859543800354, + "epoch": 1.9595588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041420986646954754, + "kl": 0.0642724260687828, + "learning_rate": 3.2598108473778595e-07, + "loss": 0.0006, + "num_tokens": 50475504.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6025726795196533, + "sampling/importance_sampling_ratio/mean": 0.9999085068702698, + "sampling/importance_sampling_ratio/min": 0.6138669848442078, + "sampling/sampling_logp_difference/max": 0.4879770278930664, + "sampling/sampling_logp_difference/mean": 0.011987491510808468, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 161.59375, + "completions/mean_terminated_length": 161.59375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.32719677686691284, + "epoch": 1.9607843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4560910886561118, + "kl": 0.13007999956607819, + "learning_rate": 3.253134199050489e-07, + "loss": 0.0307, + "num_tokens": 50503574.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5920594930648804, + "sampling/importance_sampling_ratio/mean": 0.9995177388191223, + "sampling/importance_sampling_ratio/min": 0.5006640553474426, + "sampling/sampling_logp_difference/max": 0.6918199062347412, + "sampling/sampling_logp_difference/mean": 0.016694029793143272, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 168.484375, + "completions/mean_terminated_length": 168.484375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2404722273349762, + "epoch": 1.9620098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20710936380695089, + "kl": 0.1067638099193573, + "learning_rate": 3.2464610967500273e-07, + "loss": 0.0011, + "num_tokens": 50534469.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7070449590682983, + "sampling/importance_sampling_ratio/mean": 1.0011273622512817, + "sampling/importance_sampling_ratio/min": 0.6255688667297363, + "sampling/sampling_logp_difference/max": 0.5347638130187988, + "sampling/sampling_logp_difference/mean": 0.014591801911592484, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 184.5, + "completions/mean_terminated_length": 184.5, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2132427841424942, + "epoch": 1.9632352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.303006906849855, + "kl": 0.07793626189231873, + "learning_rate": 3.239791554022449e-07, + "loss": 0.0146, + "num_tokens": 50565061.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6684820652008057, + "sampling/importance_sampling_ratio/mean": 0.9997797012329102, + "sampling/importance_sampling_ratio/min": 0.48260486125946045, + "sampling/sampling_logp_difference/max": 0.7285571098327637, + "sampling/sampling_logp_difference/mean": 0.013838795945048332, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 162.890625, + "completions/mean_terminated_length": 162.890625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2564378082752228, + "epoch": 1.9644607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.294506117895636, + "kl": 0.09519857168197632, + "learning_rate": 3.233125584406505e-07, + "loss": -0.0097, + "num_tokens": 50594606.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.988013505935669, + "sampling/importance_sampling_ratio/mean": 1.0003509521484375, + "sampling/importance_sampling_ratio/min": 0.6083920001983643, + "sampling/sampling_logp_difference/max": 0.6871359348297119, + "sampling/sampling_logp_difference/mean": 0.01461451593786478, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 172.53125, + "completions/mean_terminated_length": 172.53125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.1933905929327011, + "epoch": 1.965686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039783349055590174, + "kl": 0.06623965501785278, + "learning_rate": 3.226463201433688e-07, + "loss": 0.0006, + "num_tokens": 50625808.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8062872886657715, + "sampling/importance_sampling_ratio/mean": 0.9996256828308105, + "sampling/importance_sampling_ratio/min": 0.4821670353412628, + "sampling/sampling_logp_difference/max": 0.7294646501541138, + "sampling/sampling_logp_difference/mean": 0.01147723849862814, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 199.53125, + "completions/mean_terminated_length": 199.53125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2979511618614197, + "epoch": 1.9669117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9639861993875047, + "kl": 0.1098884642124176, + "learning_rate": 3.219804418628216e-07, + "loss": -0.0108, + "num_tokens": 50658978.0, + "reward": 0.71875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7011988162994385, + "sampling/importance_sampling_ratio/mean": 0.999648928642273, + "sampling/importance_sampling_ratio/min": 0.4642985165119171, + "sampling/sampling_logp_difference/max": 0.7672275304794312, + "sampling/sampling_logp_difference/mean": 0.01718880608677864, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 166.421875, + "completions/mean_terminated_length": 166.421875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.21918423473834991, + "epoch": 1.968137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1573657034302032, + "kl": 0.07675355672836304, + "learning_rate": 3.2131492495069965e-07, + "loss": 0.022, + "num_tokens": 50690877.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6303189992904663, + "sampling/importance_sampling_ratio/mean": 1.0002782344818115, + "sampling/importance_sampling_ratio/min": 0.33144819736480713, + "sampling/sampling_logp_difference/max": 1.1042838096618652, + "sampling/sampling_logp_difference/mean": 0.012949703261256218, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 199.28125, + "completions/mean_terminated_length": 199.28125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2782513201236725, + "epoch": 1.969362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3616340941440406, + "kl": 0.09660226106643677, + "learning_rate": 3.206497707579598e-07, + "loss": -0.068, + "num_tokens": 50723055.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992948770523071, + "sampling/importance_sampling_ratio/min": 0.5910187363624573, + "sampling/sampling_logp_difference/max": 0.9737603664398193, + "sampling/sampling_logp_difference/mean": 0.015251495875418186, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 182.84375, + "completions/mean_terminated_length": 182.84375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.22558334469795227, + "epoch": 1.9705882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5858964919685112, + "kl": 0.07987121492624283, + "learning_rate": 3.199849806348233e-07, + "loss": -0.0067, + "num_tokens": 50753557.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4054455757141113, + "sampling/importance_sampling_ratio/mean": 0.9998422861099243, + "sampling/importance_sampling_ratio/min": 0.6121768355369568, + "sampling/sampling_logp_difference/max": 0.4907341003417969, + "sampling/sampling_logp_difference/mean": 0.013224356807768345, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 145.75, + "completions/mean_terminated_length": 145.75, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.26014161109924316, + "epoch": 1.971813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09850640337484572, + "kl": 0.12250128388404846, + "learning_rate": 3.1932055593077166e-07, + "loss": 0.0012, + "num_tokens": 50777813.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6067885160446167, + "sampling/importance_sampling_ratio/mean": 0.999258279800415, + "sampling/importance_sampling_ratio/min": 0.6246219277381897, + "sampling/sampling_logp_difference/max": 0.47423744201660156, + "sampling/sampling_logp_difference/mean": 0.015161692164838314, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 215.859375, + "completions/mean_terminated_length": 215.859375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.27312037348747253, + "epoch": 1.9730392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1018882184775826, + "kl": 0.06669708341360092, + "learning_rate": 3.186564979945453e-07, + "loss": 0.0007, + "num_tokens": 50810908.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.607377052307129, + "sampling/importance_sampling_ratio/mean": 0.9995588064193726, + "sampling/importance_sampling_ratio/min": 0.17369602620601654, + "sampling/sampling_logp_difference/max": 1.75044846534729, + "sampling/sampling_logp_difference/mean": 0.014527924358844757, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 253.8125, + "completions/mean_terminated_length": 253.8125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.21349847316741943, + "epoch": 1.9742647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2571214131193265, + "kl": 0.05080464482307434, + "learning_rate": 3.179928081741394e-07, + "loss": -0.0023, + "num_tokens": 50853984.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5920687913894653, + "sampling/importance_sampling_ratio/mean": 1.0000137090682983, + "sampling/importance_sampling_ratio/min": 0.6220495700836182, + "sampling/sampling_logp_difference/max": 0.4747354984283447, + "sampling/sampling_logp_difference/mean": 0.012062931433320045, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 230.875, + "completions/mean_terminated_length": 230.875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2812146246433258, + "epoch": 1.9754901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041661626205210384, + "kl": 0.07609853148460388, + "learning_rate": 3.173294878168025e-07, + "loss": 0.0008, + "num_tokens": 50885608.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5630258321762085, + "sampling/importance_sampling_ratio/mean": 1.0003044605255127, + "sampling/importance_sampling_ratio/min": 0.6308006644248962, + "sampling/sampling_logp_difference/max": 0.46076536178588867, + "sampling/sampling_logp_difference/mean": 0.014220191165804863, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 182.390625, + "completions/mean_terminated_length": 182.390625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3182164430618286, + "epoch": 1.9767156862745097, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8922180588213584, + "kl": 0.11925086379051208, + "learning_rate": 3.166665382690327e-07, + "loss": 0.0473, + "num_tokens": 50916465.0, + "reward": 0.21875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002799034118652, + "sampling/importance_sampling_ratio/min": 0.40446823835372925, + "sampling/sampling_logp_difference/max": 1.0331223011016846, + "sampling/sampling_logp_difference/mean": 0.019244685769081116, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 208.234375, + "completions/mean_terminated_length": 208.234375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.23336933553218842, + "epoch": 1.9779411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3085149064833899, + "kl": 0.07907348871231079, + "learning_rate": 3.1600396087657586e-07, + "loss": 0.0517, + "num_tokens": 50945344.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4848120212554932, + "sampling/importance_sampling_ratio/mean": 1.0001044273376465, + "sampling/importance_sampling_ratio/min": 0.5559201240539551, + "sampling/sampling_logp_difference/max": 0.5871306657791138, + "sampling/sampling_logp_difference/mean": 0.012774837203323841, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 239.75, + "completions/mean_terminated_length": 239.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.2610890865325928, + "epoch": 1.9791666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040694708580853736, + "kl": 0.07112009823322296, + "learning_rate": 3.153417569844219e-07, + "loss": 0.0007, + "num_tokens": 50982992.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6413822174072266, + "sampling/importance_sampling_ratio/mean": 0.9998020529747009, + "sampling/importance_sampling_ratio/min": 0.486931711435318, + "sampling/sampling_logp_difference/max": 0.7196313142776489, + "sampling/sampling_logp_difference/mean": 0.014887130819261074, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 196.34375, + "completions/mean_terminated_length": 196.34375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2693784534931183, + "epoch": 1.9803921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2412155777803517, + "kl": 0.09714090079069138, + "learning_rate": 3.1467992793680267e-07, + "loss": -0.0023, + "num_tokens": 51019750.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7114192247390747, + "sampling/importance_sampling_ratio/mean": 1.0001533031463623, + "sampling/importance_sampling_ratio/min": 0.44735607504844666, + "sampling/sampling_logp_difference/max": 0.8044005036354065, + "sampling/sampling_logp_difference/mean": 0.015305680222809315, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 154.328125, + "completions/mean_terminated_length": 154.328125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.24011406302452087, + "epoch": 1.9816176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.330582147848195, + "kl": 0.1220964565873146, + "learning_rate": 3.140184750771895e-07, + "loss": 0.0123, + "num_tokens": 51046587.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4753892421722412, + "sampling/importance_sampling_ratio/mean": 1.0000015497207642, + "sampling/importance_sampling_ratio/min": 0.605623185634613, + "sampling/sampling_logp_difference/max": 0.5014972686767578, + "sampling/sampling_logp_difference/mean": 0.013417389243841171, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 178.828125, + "completions/mean_terminated_length": 178.828125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2093338817358017, + "epoch": 1.982843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053016830426800644, + "kl": 0.08074681460857391, + "learning_rate": 3.133573997482896e-07, + "loss": 0.0007, + "num_tokens": 51081232.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996959567070007, + "sampling/importance_sampling_ratio/min": 0.41164177656173706, + "sampling/sampling_logp_difference/max": 0.9979722499847412, + "sampling/sampling_logp_difference/mean": 0.014013232663273811, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 157.359375, + "completions/mean_terminated_length": 157.359375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.23903775215148926, + "epoch": 1.9840686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04974164464841695, + "kl": 0.07109355926513672, + "learning_rate": 3.1269670329204393e-07, + "loss": 0.0007, + "num_tokens": 51111383.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6349729299545288, + "sampling/importance_sampling_ratio/mean": 0.9996782541275024, + "sampling/importance_sampling_ratio/min": 0.33583348989486694, + "sampling/sampling_logp_difference/max": 1.091139793395996, + "sampling/sampling_logp_difference/mean": 0.014817346818745136, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 139.53125, + "completions/mean_terminated_length": 139.53125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.22959363460540771, + "epoch": 1.9852941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.527513624288118, + "kl": 0.0815734714269638, + "learning_rate": 3.1203638704962465e-07, + "loss": -0.0098, + "num_tokens": 51137593.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001217126846313, + "sampling/importance_sampling_ratio/min": 0.4106025695800781, + "sampling/sampling_logp_difference/max": 1.313234806060791, + "sampling/sampling_logp_difference/mean": 0.014172550290822983, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 215.984375, + "completions/mean_terminated_length": 215.984375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2980981469154358, + "epoch": 1.9865196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3724718548416854, + "kl": 0.15632514655590057, + "learning_rate": 3.11376452361432e-07, + "loss": 0.0159, + "num_tokens": 51166440.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.597182273864746, + "sampling/importance_sampling_ratio/mean": 0.9992493391036987, + "sampling/importance_sampling_ratio/min": 0.6223132014274597, + "sampling/sampling_logp_difference/max": 0.47431182861328125, + "sampling/sampling_logp_difference/mean": 0.016231466084718704, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 203.078125, + "completions/mean_terminated_length": 203.078125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.18610015511512756, + "epoch": 1.9877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054405014400212094, + "kl": 0.07010403275489807, + "learning_rate": 3.107169005670912e-07, + "loss": 0.0007, + "num_tokens": 51194733.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6613489389419556, + "sampling/importance_sampling_ratio/mean": 0.9995604753494263, + "sampling/importance_sampling_ratio/min": 0.3985402286052704, + "sampling/sampling_logp_difference/max": 0.9199469089508057, + "sampling/sampling_logp_difference/mean": 0.012981155887246132, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 137.703125, + "completions/mean_terminated_length": 137.703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.20478081703186035, + "epoch": 1.9889705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.933138185478157, + "kl": 0.10997778922319412, + "learning_rate": 3.100577330054508e-07, + "loss": -0.0035, + "num_tokens": 51224362.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5855019092559814, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.4679851531982422, + "sampling/sampling_logp_difference/mean": 0.012617578729987144, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.31237688660621643, + "epoch": 1.9901960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.1662963505110064, + "kl": 0.14189650118350983, + "learning_rate": 3.0939895101457914e-07, + "loss": -0.0092, + "num_tokens": 51251210.0, + "reward": 0.6875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6554615497589111, + "sampling/importance_sampling_ratio/mean": 1.00002121925354, + "sampling/importance_sampling_ratio/min": 0.6378210783004761, + "sampling/sampling_logp_difference/max": 0.5040798187255859, + "sampling/sampling_logp_difference/mean": 0.015452057123184204, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 285.578125, + "completions/mean_terminated_length": 285.578125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.16210177540779114, + "epoch": 1.991421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029328000825859252, + "kl": 0.05415792763233185, + "learning_rate": 3.087405559317622e-07, + "loss": 0.0005, + "num_tokens": 51286975.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.761606216430664, + "sampling/importance_sampling_ratio/mean": 0.9996249675750732, + "sampling/importance_sampling_ratio/min": 0.5097636580467224, + "sampling/sampling_logp_difference/max": 0.6738080978393555, + "sampling/sampling_logp_difference/mean": 0.009517800062894821, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 197.203125, + "completions/mean_terminated_length": 197.203125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.26196715235710144, + "epoch": 1.9926470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.022580389597765, + "kl": 0.10579611361026764, + "learning_rate": 3.0808254909349986e-07, + "loss": -0.0517, + "num_tokens": 51316620.0, + "reward": 0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.5474581718444824, + "sampling/sampling_logp_difference/max": 0.9265649318695068, + "sampling/sampling_logp_difference/mean": 0.01420481875538826, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 225.484375, + "completions/mean_terminated_length": 225.484375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.24070894718170166, + "epoch": 1.9938725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03621637216943933, + "kl": 0.05783402919769287, + "learning_rate": 3.0742493183550454e-07, + "loss": 0.0005, + "num_tokens": 51352907.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9146305322647095, + "sampling/importance_sampling_ratio/mean": 1.0004422664642334, + "sampling/importance_sampling_ratio/min": 0.5428375005722046, + "sampling/sampling_logp_difference/max": 0.6495246887207031, + "sampling/sampling_logp_difference/mean": 0.015581740997731686, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 206.3125, + "completions/mean_terminated_length": 206.3125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2880558967590332, + "epoch": 1.9950980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2085552696095494, + "kl": 0.10221179574728012, + "learning_rate": 3.0676770549269786e-07, + "loss": 0.009, + "num_tokens": 51387359.0, + "reward": 0.375, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.9623781442642212, + "sampling/importance_sampling_ratio/mean": 0.9996896386146545, + "sampling/importance_sampling_ratio/min": 0.5363050699234009, + "sampling/sampling_logp_difference/max": 0.6741571426391602, + "sampling/sampling_logp_difference/mean": 0.016578059643507004, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 196.25, + "completions/mean_terminated_length": 196.25, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.24638226628303528, + "epoch": 1.9963235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1180435070672479, + "kl": 0.09617805480957031, + "learning_rate": 3.0611087139920717e-07, + "loss": 0.0039, + "num_tokens": 51416639.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6157478094100952, + "sampling/importance_sampling_ratio/mean": 1.0001851320266724, + "sampling/importance_sampling_ratio/min": 0.49470096826553345, + "sampling/sampling_logp_difference/max": 0.7038018703460693, + "sampling/sampling_logp_difference/mean": 0.015872016549110413, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 178.890625, + "completions/mean_terminated_length": 178.890625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2153649926185608, + "epoch": 1.9975490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1406778974066792, + "kl": 0.06773126870393753, + "learning_rate": 3.054544308883643e-07, + "loss": 0.0007, + "num_tokens": 51447352.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5612868070602417, + "sampling/importance_sampling_ratio/mean": 0.9997575283050537, + "sampling/importance_sampling_ratio/min": 0.611184298992157, + "sampling/sampling_logp_difference/max": 0.4923567771911621, + "sampling/sampling_logp_difference/mean": 0.013232271187007427, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.24145972728729248, + "epoch": 1.9987745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4198898444810162, + "kl": 0.058622464537620544, + "learning_rate": 3.0479838529270186e-07, + "loss": -0.029, + "num_tokens": 51476880.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7522567510604858, + "sampling/importance_sampling_ratio/mean": 1.0002727508544922, + "sampling/importance_sampling_ratio/min": 0.48241639137268066, + "sampling/sampling_logp_difference/max": 0.728947639465332, + "sampling/sampling_logp_difference/mean": 0.01329050399363041, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 242.71875, + "completions/mean_terminated_length": 242.71875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2610500454902649, + "epoch": 2.0, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6373740660379532, + "kl": 0.11630761623382568, + "learning_rate": 3.0414273594395103e-07, + "loss": 0.0245, + "num_tokens": 51511198.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.784226894378662, + "sampling/importance_sampling_ratio/mean": 0.9999319911003113, + "sampling/importance_sampling_ratio/min": 0.631510317325592, + "sampling/sampling_logp_difference/max": 0.5789852142333984, + "sampling/sampling_logp_difference/mean": 0.013508956879377365, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 185.875, + "completions/mean_terminated_length": 185.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2706983685493469, + "epoch": 2.0012254901960786, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3157672512576266, + "kl": 0.13925369083881378, + "learning_rate": 3.034874841730382e-07, + "loss": 0.0319, + "num_tokens": 51544934.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.9059497117996216, + "sampling/importance_sampling_ratio/mean": 0.9994834661483765, + "sampling/importance_sampling_ratio/min": 0.44495993852615356, + "sampling/sampling_logp_difference/max": 0.8097710609436035, + "sampling/sampling_logp_difference/mean": 0.015468169935047626, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 163.25, + "completions/mean_terminated_length": 163.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.24652785062789917, + "epoch": 2.002450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3515748376029462, + "kl": 0.1129264310002327, + "learning_rate": 3.0283263131008307e-07, + "loss": -0.0103, + "num_tokens": 51573478.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999708533287048, + "sampling/importance_sampling_ratio/min": 0.4943215250968933, + "sampling/sampling_logp_difference/max": 0.7337357997894287, + "sampling/sampling_logp_difference/mean": 0.016296112909913063, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 205.953125, + "completions/mean_terminated_length": 205.953125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.307706356048584, + "epoch": 2.0036764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0952524559695727, + "kl": 0.0964609831571579, + "learning_rate": 3.0217817868439545e-07, + "loss": 0.0052, + "num_tokens": 51601283.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6070095300674438, + "sampling/importance_sampling_ratio/mean": 0.9998152256011963, + "sampling/importance_sampling_ratio/min": 0.5948570966720581, + "sampling/sampling_logp_difference/max": 0.5194340944290161, + "sampling/sampling_logp_difference/mean": 0.015333171933889389, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 144.859375, + "completions/mean_terminated_length": 144.859375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.20275729894638062, + "epoch": 2.0049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10984783925805666, + "kl": 0.07397639751434326, + "learning_rate": 3.015241276244729e-07, + "loss": 0.0007, + "num_tokens": 51627770.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.627306342124939, + "sampling/importance_sampling_ratio/mean": 1.0005347728729248, + "sampling/importance_sampling_ratio/min": 0.44181501865386963, + "sampling/sampling_logp_difference/max": 0.816864013671875, + "sampling/sampling_logp_difference/mean": 0.013549655675888062, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 193.25, + "completions/mean_terminated_length": 193.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.20195108652114868, + "epoch": 2.0061274509803924, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3513810206626793, + "kl": 0.08534161746501923, + "learning_rate": 3.0087047945799724e-07, + "loss": 0.0043, + "num_tokens": 51654682.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6472231149673462, + "sampling/importance_sampling_ratio/mean": 1.0002919435501099, + "sampling/importance_sampling_ratio/min": 0.3821452260017395, + "sampling/sampling_logp_difference/max": 0.9619545936584473, + "sampling/sampling_logp_difference/mean": 0.012385329231619835, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 186.296875, + "completions/mean_terminated_length": 186.296875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24972376227378845, + "epoch": 2.0073529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6237297199685563, + "kl": 0.11721492558717728, + "learning_rate": 3.002172355118331e-07, + "loss": 0.004, + "num_tokens": 51686381.0, + "reward": 0.5625, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4443892240524292, + "sampling/importance_sampling_ratio/mean": 1.0004124641418457, + "sampling/importance_sampling_ratio/min": 0.6345492601394653, + "sampling/sampling_logp_difference/max": 0.45484042167663574, + "sampling/sampling_logp_difference/mean": 0.013366533443331718, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 227.734375, + "completions/mean_terminated_length": 227.734375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.35807353258132935, + "epoch": 2.008578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.582531226173086, + "kl": 0.09623866528272629, + "learning_rate": 2.995643971120243e-07, + "loss": 0.0143, + "num_tokens": 51720076.0, + "reward": 0.75, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.7915570735931396, + "sampling/importance_sampling_ratio/mean": 1.0000855922698975, + "sampling/importance_sampling_ratio/min": 0.5658215284347534, + "sampling/sampling_logp_difference/max": 0.5830850601196289, + "sampling/sampling_logp_difference/mean": 0.017493925988674164, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 157.59375, + "completions/mean_terminated_length": 157.59375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.20872214436531067, + "epoch": 2.0098039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.527939124978404, + "kl": 0.1340126097202301, + "learning_rate": 2.9891196558379126e-07, + "loss": 0.0012, + "num_tokens": 51747458.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4740839004516602, + "sampling/importance_sampling_ratio/mean": 1.0000025033950806, + "sampling/importance_sampling_ratio/min": 0.39433911442756653, + "sampling/sampling_logp_difference/max": 0.9305441379547119, + "sampling/sampling_logp_difference/mean": 0.013612005859613419, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 177.0, + "completions/mean_terminated_length": 177.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.20791053771972656, + "epoch": 2.011029411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32638084040683657, + "kl": 0.1111663281917572, + "learning_rate": 2.9825994225152884e-07, + "loss": 0.0011, + "num_tokens": 51775074.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8181694746017456, + "sampling/importance_sampling_ratio/mean": 0.999299168586731, + "sampling/importance_sampling_ratio/min": 0.45959511399269104, + "sampling/sampling_logp_difference/max": 0.7774093151092529, + "sampling/sampling_logp_difference/mean": 0.013892569579184055, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 165.78125, + "completions/mean_terminated_length": 165.78125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2592073380947113, + "epoch": 2.0122549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.3884884387292047, + "kl": 0.09146402031183243, + "learning_rate": 2.976083284388031e-07, + "loss": 0.0709, + "num_tokens": 51804052.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4387812614440918, + "sampling/importance_sampling_ratio/mean": 0.9987965226173401, + "sampling/importance_sampling_ratio/min": 0.5887529253959656, + "sampling/sampling_logp_difference/max": 0.5297486782073975, + "sampling/sampling_logp_difference/mean": 0.015599111095070839, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.2273353636264801, + "epoch": 2.013480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0405693698287233, + "kl": 0.06104673817753792, + "learning_rate": 2.9695712546834885e-07, + "loss": -0.0192, + "num_tokens": 51849042.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6454516649246216, + "sampling/importance_sampling_ratio/mean": 0.9998835325241089, + "sampling/importance_sampling_ratio/min": 0.1766490489244461, + "sampling/sampling_logp_difference/max": 1.7335902452468872, + "sampling/sampling_logp_difference/mean": 0.013499276712536812, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 180.59375, + "completions/mean_terminated_length": 180.59375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.21701395511627197, + "epoch": 2.014705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050001106355846855, + "kl": 0.08167947828769684, + "learning_rate": 2.9630633466206655e-07, + "loss": 0.0008, + "num_tokens": 51881640.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6478513479232788, + "sampling/importance_sampling_ratio/mean": 1.0004024505615234, + "sampling/importance_sampling_ratio/min": 0.39870205521583557, + "sampling/sampling_logp_difference/max": 0.9195408821105957, + "sampling/sampling_logp_difference/mean": 0.014474079012870789, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 165.53125, + "completions/mean_terminated_length": 165.53125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.25972750782966614, + "epoch": 2.0159313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.476203307790272, + "kl": 0.12379482388496399, + "learning_rate": 2.9565595734102043e-07, + "loss": -0.0181, + "num_tokens": 51909386.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6418057680130005, + "sampling/importance_sampling_ratio/mean": 1.0005497932434082, + "sampling/importance_sampling_ratio/min": 0.5912330746650696, + "sampling/sampling_logp_difference/max": 0.5255449414253235, + "sampling/sampling_logp_difference/mean": 0.016285490244627, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 173.53125, + "completions/mean_terminated_length": 173.53125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.22746597230434418, + "epoch": 2.017156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2127214914639128, + "kl": 0.12575247883796692, + "learning_rate": 2.950059948254355e-07, + "loss": -0.0012, + "num_tokens": 51938028.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.4987714886665344, + "sampling/sampling_logp_difference/max": 0.8758199214935303, + "sampling/sampling_logp_difference/mean": 0.01391543261706829, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 221.625, + "completions/mean_terminated_length": 221.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2744201421737671, + "epoch": 2.0183823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5979122875807548, + "kl": 0.06626914441585541, + "learning_rate": 2.943564484346943e-07, + "loss": 0.0352, + "num_tokens": 51972324.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6574277877807617, + "sampling/importance_sampling_ratio/mean": 1.0007280111312866, + "sampling/importance_sampling_ratio/min": 0.5644875764846802, + "sampling/sampling_logp_difference/max": 0.5718369483947754, + "sampling/sampling_logp_difference/mean": 0.013680658303201199, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 182.65625, + "completions/mean_terminated_length": 182.65625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2576046586036682, + "epoch": 2.019607843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6970821857781622, + "kl": 0.10263028740882874, + "learning_rate": 2.937073194873348e-07, + "loss": -0.0233, + "num_tokens": 52002846.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999614953994751, + "sampling/importance_sampling_ratio/min": 0.6093730926513672, + "sampling/sampling_logp_difference/max": 0.7053864002227783, + "sampling/sampling_logp_difference/mean": 0.015043235383927822, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 217.34375, + "completions/mean_terminated_length": 217.34375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3107795715332031, + "epoch": 2.0208333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045661501611413365, + "kl": 0.07256503403186798, + "learning_rate": 2.930586093010477e-07, + "loss": 0.0007, + "num_tokens": 52032532.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4629426002502441, + "sampling/importance_sampling_ratio/mean": 0.9996424317359924, + "sampling/importance_sampling_ratio/min": 0.4803723096847534, + "sampling/sampling_logp_difference/max": 0.7331938743591309, + "sampling/sampling_logp_difference/mean": 0.015414186753332615, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 226.890625, + "completions/mean_terminated_length": 226.890625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3023676872253418, + "epoch": 2.0220588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5121507858951995, + "kl": 0.12815862894058228, + "learning_rate": 2.9241031919267363e-07, + "loss": -0.0428, + "num_tokens": 52061133.0, + "reward": 0.4375, + "reward_std": 0.5081988573074341, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.504328966140747, + "sampling/importance_sampling_ratio/mean": 0.9998545050621033, + "sampling/importance_sampling_ratio/min": 0.14129947125911713, + "sampling/sampling_logp_difference/max": 1.9568736553192139, + "sampling/sampling_logp_difference/mean": 0.01576872542500496, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 204.53125, + "completions/mean_terminated_length": 204.53125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.33197981119155884, + "epoch": 2.0232843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7555989411861663, + "kl": 0.11538799107074738, + "learning_rate": 2.917624504782006e-07, + "loss": 0.0021, + "num_tokens": 52098767.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996583461761475, + "sampling/importance_sampling_ratio/min": 0.3384450078010559, + "sampling/sampling_logp_difference/max": 1.0833935737609863, + "sampling/sampling_logp_difference/mean": 0.01745392009615898, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 150.171875, + "completions/mean_terminated_length": 150.171875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2159707099199295, + "epoch": 2.0245098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05194142319735925, + "kl": 0.07415338605642319, + "learning_rate": 2.911150044727605e-07, + "loss": 0.0007, + "num_tokens": 52131466.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6623896360397339, + "sampling/importance_sampling_ratio/mean": 1.0000048875808716, + "sampling/importance_sampling_ratio/min": 0.5678412318229675, + "sampling/sampling_logp_difference/max": 0.5659134387969971, + "sampling/sampling_logp_difference/mean": 0.014511508867144585, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 142.234375, + "completions/mean_terminated_length": 142.234375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.194686159491539, + "epoch": 2.025735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6919832040340759, + "kl": 0.07016637176275253, + "learning_rate": 2.9046798249062824e-07, + "loss": -0.0052, + "num_tokens": 52161481.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6222556829452515, + "sampling/importance_sampling_ratio/mean": 1.0002816915512085, + "sampling/importance_sampling_ratio/min": 0.5483796000480652, + "sampling/sampling_logp_difference/max": 0.6007875204086304, + "sampling/sampling_logp_difference/mean": 0.013081444427371025, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 207.828125, + "completions/mean_terminated_length": 207.828125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.25522762537002563, + "epoch": 2.0269607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06765847310099816, + "kl": 0.0819954201579094, + "learning_rate": 2.898213858452173e-07, + "loss": 0.0008, + "num_tokens": 52193854.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9768575429916382, + "sampling/importance_sampling_ratio/mean": 0.9995952248573303, + "sampling/importance_sampling_ratio/min": 0.37231582403182983, + "sampling/sampling_logp_difference/max": 0.9880127906799316, + "sampling/sampling_logp_difference/mean": 0.015854569151997566, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 180.421875, + "completions/mean_terminated_length": 180.421875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.23425695300102234, + "epoch": 2.028186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.988364758623323, + "kl": 0.09826873242855072, + "learning_rate": 2.891752158490778e-07, + "loss": -0.0345, + "num_tokens": 52222905.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4792793989181519, + "sampling/importance_sampling_ratio/mean": 0.99944669008255, + "sampling/importance_sampling_ratio/min": 0.5910207033157349, + "sampling/sampling_logp_difference/max": 0.5259042382240295, + "sampling/sampling_logp_difference/mean": 0.012566267512738705, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 288.609375, + "completions/mean_terminated_length": 288.609375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.29428747296333313, + "epoch": 2.0294117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.109729955289726, + "kl": 0.10290320217609406, + "learning_rate": 2.8852947381389405e-07, + "loss": 0.0009, + "num_tokens": 52261184.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971866607666, + "sampling/importance_sampling_ratio/min": 0.4756595194339752, + "sampling/sampling_logp_difference/max": 0.7430529594421387, + "sampling/sampling_logp_difference/mean": 0.014198487624526024, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 232.375, + "completions/mean_terminated_length": 232.375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.26211094856262207, + "epoch": 2.030637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1851833220447131, + "kl": 0.06994898617267609, + "learning_rate": 2.8788416105048117e-07, + "loss": 0.0007, + "num_tokens": 52298632.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5943752527236938, + "sampling/importance_sampling_ratio/mean": 1.000478982925415, + "sampling/importance_sampling_ratio/min": 0.6649942994117737, + "sampling/sampling_logp_difference/max": 0.46648192405700684, + "sampling/sampling_logp_difference/mean": 0.014292774721980095, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 255.125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2893081307411194, + "epoch": 2.031862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.480910825526405, + "kl": 0.0972118228673935, + "learning_rate": 2.8723927886878396e-07, + "loss": -0.0385, + "num_tokens": 52334032.0, + "reward": 0.65625, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6980862617492676, + "sampling/importance_sampling_ratio/mean": 0.9996001124382019, + "sampling/importance_sampling_ratio/min": 0.4472591280937195, + "sampling/sampling_logp_difference/max": 0.804617166519165, + "sampling/sampling_logp_difference/mean": 0.01658753678202629, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 137.859375, + "completions/mean_terminated_length": 137.859375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.18066023290157318, + "epoch": 2.0330882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06619401552970619, + "kl": 0.06539852917194366, + "learning_rate": 2.865948285778713e-07, + "loss": 0.0007, + "num_tokens": 52355015.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8713295459747314, + "sampling/importance_sampling_ratio/mean": 1.0002483129501343, + "sampling/importance_sampling_ratio/min": 0.16636940836906433, + "sampling/sampling_logp_difference/max": 1.7935445308685303, + "sampling/sampling_logp_difference/mean": 0.012770957313477993, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 203.109375, + "completions/mean_terminated_length": 203.109375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2152371108531952, + "epoch": 2.034313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.309699021857678, + "kl": 0.05949123576283455, + "learning_rate": 2.8595081148593737e-07, + "loss": 0.008, + "num_tokens": 52385950.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6382622718811035, + "sampling/importance_sampling_ratio/mean": 0.9996830821037292, + "sampling/importance_sampling_ratio/min": 0.6056292653083801, + "sampling/sampling_logp_difference/max": 0.5014872550964355, + "sampling/sampling_logp_difference/mean": 0.012422558851540089, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 196.859375, + "completions/mean_terminated_length": 196.859375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.26284027099609375, + "epoch": 2.0355392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0482797283796569, + "kl": 0.091962069272995, + "learning_rate": 2.8530722890029534e-07, + "loss": 0.0009, + "num_tokens": 52414773.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6307573318481445, + "sampling/importance_sampling_ratio/mean": 0.9998902082443237, + "sampling/importance_sampling_ratio/min": 0.2332911193370819, + "sampling/sampling_logp_difference/max": 1.4554681777954102, + "sampling/sampling_logp_difference/mean": 0.015172924846410751, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 178.046875, + "completions/mean_terminated_length": 178.046875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2818187475204468, + "epoch": 2.036764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7199197507014743, + "kl": 0.13522930443286896, + "learning_rate": 2.8466408212737776e-07, + "loss": 0.0245, + "num_tokens": 52441512.0, + "reward": 0.21875, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.705733299255371, + "sampling/importance_sampling_ratio/mean": 1.0003222227096558, + "sampling/importance_sampling_ratio/min": 0.62389075756073, + "sampling/sampling_logp_difference/max": 0.5339951515197754, + "sampling/sampling_logp_difference/mean": 0.014451962895691395, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 242.5, + "completions/mean_terminated_length": 242.5, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.2337619662284851, + "epoch": 2.0379901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.369306100618678, + "kl": 0.10952450335025787, + "learning_rate": 2.840213724727315e-07, + "loss": 0.0113, + "num_tokens": 52473832.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 1.000237226486206, + "sampling/importance_sampling_ratio/min": 0.010155175812542439, + "sampling/sampling_logp_difference/max": 4.589771747589111, + "sampling/sampling_logp_difference/mean": 0.012999322265386581, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 258.328125, + "completions/mean_terminated_length": 258.328125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.24671608209609985, + "epoch": 2.0392156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.085051997504276, + "kl": 0.06979397684335709, + "learning_rate": 2.8337910124101625e-07, + "loss": -0.0498, + "num_tokens": 52506429.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002784729003906, + "sampling/importance_sampling_ratio/min": 0.35105717182159424, + "sampling/sampling_logp_difference/max": 1.5489122867584229, + "sampling/sampling_logp_difference/mean": 0.014655955135822296, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 207.640625, + "completions/mean_terminated_length": 207.640625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.2997831404209137, + "epoch": 2.0404411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.323549977186844, + "kl": 0.18803086876869202, + "learning_rate": 2.8273726973600254e-07, + "loss": 0.0101, + "num_tokens": 52540662.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.99234139919281, + "sampling/importance_sampling_ratio/mean": 1.0006390810012817, + "sampling/importance_sampling_ratio/min": 0.5900681018829346, + "sampling/sampling_logp_difference/max": 0.6893105506896973, + "sampling/sampling_logp_difference/mean": 0.015998469665646553, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 191.390625, + "completions/mean_terminated_length": 191.390625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.21080835163593292, + "epoch": 2.0416666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056386538308938354, + "kl": 0.06706511229276657, + "learning_rate": 2.8209587926056687e-07, + "loss": 0.0007, + "num_tokens": 52573439.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.566491961479187, + "sampling/importance_sampling_ratio/mean": 1.0001158714294434, + "sampling/importance_sampling_ratio/min": 0.5505178570747375, + "sampling/sampling_logp_difference/max": 0.5968958139419556, + "sampling/sampling_logp_difference/mean": 0.012801194563508034, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 206.6875, + "completions/mean_terminated_length": 206.6875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2314741015434265, + "epoch": 2.042892156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5137123224282574, + "kl": 0.11053841561079025, + "learning_rate": 2.8145493111669183e-07, + "loss": -0.0041, + "num_tokens": 52602443.0, + "reward": 0.78125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5832947492599487, + "sampling/importance_sampling_ratio/mean": 0.999763011932373, + "sampling/importance_sampling_ratio/min": 0.29653459787368774, + "sampling/sampling_logp_difference/max": 1.2155914306640625, + "sampling/sampling_logp_difference/mean": 0.01283535547554493, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 261.234375, + "completions/mean_terminated_length": 261.234375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.21586529910564423, + "epoch": 2.0441176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04806692302841265, + "kl": 0.058751266449689865, + "learning_rate": 2.808144266054612e-07, + "loss": 0.0006, + "num_tokens": 52639386.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6159850358963013, + "sampling/importance_sampling_ratio/mean": 0.9997762441635132, + "sampling/importance_sampling_ratio/min": 0.5676490068435669, + "sampling/sampling_logp_difference/max": 0.5662519931793213, + "sampling/sampling_logp_difference/mean": 0.011225221678614616, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 256.6875, + "completions/mean_terminated_length": 256.6875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.27154284715652466, + "epoch": 2.045343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8431077027344933, + "kl": 0.08351112902164459, + "learning_rate": 2.80174367027059e-07, + "loss": 0.0692, + "num_tokens": 52670438.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6037511825561523, + "sampling/importance_sampling_ratio/mean": 0.9998252391815186, + "sampling/importance_sampling_ratio/min": 0.5498784780502319, + "sampling/sampling_logp_difference/max": 0.5980579853057861, + "sampling/sampling_logp_difference/mean": 0.013166810385882854, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 193.40625, + "completions/mean_terminated_length": 193.40625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.30176496505737305, + "epoch": 2.0465686274509802, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8367215287667253, + "kl": 0.14671970903873444, + "learning_rate": 2.795347536807653e-07, + "loss": 0.0144, + "num_tokens": 52696944.0, + "reward": -0.21875, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.568989872932434, + "sampling/importance_sampling_ratio/mean": 1.0002939701080322, + "sampling/importance_sampling_ratio/min": 0.5676478147506714, + "sampling/sampling_logp_difference/max": 0.5662540197372437, + "sampling/sampling_logp_difference/mean": 0.015668369829654694, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 222.390625, + "completions/mean_terminated_length": 222.390625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3677259683609009, + "epoch": 2.047794117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0241763121119458, + "kl": 0.11141422390937805, + "learning_rate": 2.7889558786495455e-07, + "loss": -0.0033, + "num_tokens": 52727481.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7051466703414917, + "sampling/importance_sampling_ratio/mean": 0.9996929168701172, + "sampling/importance_sampling_ratio/min": 0.5259426236152649, + "sampling/sampling_logp_difference/max": 0.6425632238388062, + "sampling/sampling_logp_difference/mean": 0.017417607828974724, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 224.5625, + "completions/mean_terminated_length": 224.5625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.22942262887954712, + "epoch": 2.049019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1211021324738104, + "kl": 0.07600408792495728, + "learning_rate": 2.782568708770933e-07, + "loss": 0.0435, + "num_tokens": 52759661.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6190972328186035, + "sampling/importance_sampling_ratio/mean": 1.000002145767212, + "sampling/importance_sampling_ratio/min": 0.5999808311462402, + "sampling/sampling_logp_difference/max": 0.5108575820922852, + "sampling/sampling_logp_difference/mean": 0.012174902483820915, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 230.015625, + "completions/mean_terminated_length": 230.015625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.21251729130744934, + "epoch": 2.0502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2681871459692562, + "kl": 0.07802058756351471, + "learning_rate": 2.7761860401373627e-07, + "loss": -0.0245, + "num_tokens": 52791774.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000063419342041, + "sampling/importance_sampling_ratio/min": 0.3121943771839142, + "sampling/sampling_logp_difference/max": 1.1641292572021484, + "sampling/sampling_logp_difference/mean": 0.012759133242070675, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 289.390625, + "completions/mean_terminated_length": 289.390625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.24743464589118958, + "epoch": 2.051470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.958409340880211, + "kl": 0.07729962468147278, + "learning_rate": 2.7698078857052474e-07, + "loss": 0.0083, + "num_tokens": 52824631.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5117584466934204, + "sampling/importance_sampling_ratio/mean": 1.0005438327789307, + "sampling/importance_sampling_ratio/min": 0.5339420437812805, + "sampling/sampling_logp_difference/max": 0.627467930316925, + "sampling/sampling_logp_difference/mean": 0.012405122630298138, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 225.96875, + "completions/mean_terminated_length": 225.96875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.22162757813930511, + "epoch": 2.0526960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4250513679668029, + "kl": 0.07321614772081375, + "learning_rate": 2.763434258421836e-07, + "loss": -0.0132, + "num_tokens": 52856869.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009355545043945, + "sampling/importance_sampling_ratio/min": 0.5484393835067749, + "sampling/sampling_logp_difference/max": 1.0633785724639893, + "sampling/sampling_logp_difference/mean": 0.012057576328516006, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 217.609375, + "completions/mean_terminated_length": 217.609375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.21669289469718933, + "epoch": 2.053921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0402609594328909, + "kl": 0.0873531922698021, + "learning_rate": 2.757065171225192e-07, + "loss": 0.0008, + "num_tokens": 52885676.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7586616277694702, + "sampling/importance_sampling_ratio/mean": 1.0000011920928955, + "sampling/importance_sampling_ratio/min": 0.4062422215938568, + "sampling/sampling_logp_difference/max": 0.9008057117462158, + "sampling/sampling_logp_difference/mean": 0.012449707835912704, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 233.3125, + "completions/mean_terminated_length": 233.3125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.24843087792396545, + "epoch": 2.0551470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.216347259289768, + "kl": 0.08403472602367401, + "learning_rate": 2.750700637044155e-07, + "loss": -0.1293, + "num_tokens": 52916912.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996212720870972, + "sampling/importance_sampling_ratio/min": 0.5261399745941162, + "sampling/sampling_logp_difference/max": 0.7840127944946289, + "sampling/sampling_logp_difference/mean": 0.013490501791238785, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 284.015625, + "completions/mean_terminated_length": 284.015625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.2866605222225189, + "epoch": 2.0563725490196076, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0521726104314784, + "kl": 0.061432041227817535, + "learning_rate": 2.7443406687983264e-07, + "loss": 0.0113, + "num_tokens": 52955777.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.528875708580017, + "sampling/importance_sampling_ratio/mean": 0.9994683265686035, + "sampling/importance_sampling_ratio/min": 0.5874800086021423, + "sampling/sampling_logp_difference/max": 0.5319130420684814, + "sampling/sampling_logp_difference/mean": 0.014019916765391827, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 203.375, + "completions/mean_terminated_length": 203.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.29307612776756287, + "epoch": 2.0575980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4559984618208732, + "kl": 0.09767642617225647, + "learning_rate": 2.7379852793980416e-07, + "loss": -0.0363, + "num_tokens": 52985865.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5886762142181396, + "sampling/importance_sampling_ratio/mean": 0.9998399615287781, + "sampling/importance_sampling_ratio/min": 0.6176496744155884, + "sampling/sampling_logp_difference/max": 0.48183393478393555, + "sampling/sampling_logp_difference/mean": 0.013554751873016357, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 291.375, + "completions/mean_terminated_length": 291.375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3286081552505493, + "epoch": 2.0588235294117645, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.605307084978014, + "kl": 0.09625139832496643, + "learning_rate": 2.7316344817443363e-07, + "loss": -0.0145, + "num_tokens": 53023153.0, + "reward": 0.15625, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.8163264989852905, + "sampling/importance_sampling_ratio/mean": 0.999686062335968, + "sampling/importance_sampling_ratio/min": 0.6147370934486389, + "sampling/sampling_logp_difference/max": 0.5968160629272461, + "sampling/sampling_logp_difference/mean": 0.015223829075694084, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 218.40625, + "completions/mean_terminated_length": 218.40625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2261694371700287, + "epoch": 2.060049019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0154322061573835, + "kl": 0.09540575742721558, + "learning_rate": 2.7252882887289287e-07, + "loss": 0.0049, + "num_tokens": 53053099.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5433379411697388, + "sampling/importance_sampling_ratio/mean": 1.000070571899414, + "sampling/importance_sampling_ratio/min": 0.5636303424835205, + "sampling/sampling_logp_difference/max": 0.5733566284179688, + "sampling/sampling_logp_difference/mean": 0.011534119956195354, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 167.9375, + "completions/mean_terminated_length": 167.9375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.2440802901983261, + "epoch": 2.0612745098039214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06137228801321598, + "kl": 0.08271975815296173, + "learning_rate": 2.718946713234185e-07, + "loss": 0.0008, + "num_tokens": 53079223.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.570361852645874, + "sampling/importance_sampling_ratio/mean": 0.9998970627784729, + "sampling/importance_sampling_ratio/min": 0.6176997423171997, + "sampling/sampling_logp_difference/max": 0.481752872467041, + "sampling/sampling_logp_difference/mean": 0.013783842325210571, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.2853248119354248, + "epoch": 2.0625, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8566245712168283, + "kl": 0.09553046524524689, + "learning_rate": 2.712609768133106e-07, + "loss": 0.0116, + "num_tokens": 53120495.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6980427503585815, + "sampling/importance_sampling_ratio/mean": 0.999942421913147, + "sampling/importance_sampling_ratio/min": 0.48238492012023926, + "sampling/sampling_logp_difference/max": 0.7290129661560059, + "sampling/sampling_logp_difference/mean": 0.014373978599905968, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 225.390625, + "completions/mean_terminated_length": 225.390625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.28901052474975586, + "epoch": 2.063725490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9639789878845415, + "kl": 0.08507464826107025, + "learning_rate": 2.7062774662892886e-07, + "loss": -0.0054, + "num_tokens": 53156264.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6519145965576172, + "sampling/importance_sampling_ratio/mean": 1.0008249282836914, + "sampling/importance_sampling_ratio/min": 0.4598377048969269, + "sampling/sampling_logp_difference/max": 0.7768816947937012, + "sampling/sampling_logp_difference/mean": 0.014761666767299175, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 267.40625, + "completions/mean_terminated_length": 267.40625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.2973840534687042, + "epoch": 2.064950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0715930586265272, + "kl": 0.12841664254665375, + "learning_rate": 2.6999498205569e-07, + "loss": 0.04, + "num_tokens": 53191410.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.9099645614624023, + "sampling/importance_sampling_ratio/mean": 0.9996175169944763, + "sampling/importance_sampling_ratio/min": 0.480032742023468, + "sampling/sampling_logp_difference/max": 0.7339010238647461, + "sampling/sampling_logp_difference/mean": 0.016689680516719818, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 244.359375, + "completions/mean_terminated_length": 244.359375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.28265005350112915, + "epoch": 2.0661764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.889216893984033, + "kl": 0.10497517883777618, + "learning_rate": 2.693626843780665e-07, + "loss": -0.0013, + "num_tokens": 53223289.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6166887283325195, + "sampling/importance_sampling_ratio/mean": 1.000554084777832, + "sampling/importance_sampling_ratio/min": 0.6457998752593994, + "sampling/sampling_logp_difference/max": 0.4803800582885742, + "sampling/sampling_logp_difference/mean": 0.013796938583254814, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 206.015625, + "completions/mean_terminated_length": 206.015625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.2224217653274536, + "epoch": 2.0674019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05156404446184419, + "kl": 0.0919741690158844, + "learning_rate": 2.687308548795825e-07, + "loss": 0.0009, + "num_tokens": 53256058.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6253172159194946, + "sampling/importance_sampling_ratio/mean": 1.00070059299469, + "sampling/importance_sampling_ratio/min": 0.3588513731956482, + "sampling/sampling_logp_difference/max": 1.0248470306396484, + "sampling/sampling_logp_difference/mean": 0.013268515467643738, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 215.71875, + "completions/mean_terminated_length": 215.71875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.28397154808044434, + "epoch": 2.0686274509803924, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4321573737376183, + "kl": 0.10913024842739105, + "learning_rate": 2.6809949484281164e-07, + "loss": -0.0355, + "num_tokens": 53296264.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9980440139770508, + "sampling/importance_sampling_ratio/mean": 0.9999949932098389, + "sampling/importance_sampling_ratio/min": 0.36637455224990845, + "sampling/sampling_logp_difference/max": 1.0040991306304932, + "sampling/sampling_logp_difference/mean": 0.016209449619054794, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 182.953125, + "completions/mean_terminated_length": 182.953125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.22761885821819305, + "epoch": 2.0698529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2727016130142637, + "kl": 0.11536600440740585, + "learning_rate": 2.674686055493748e-07, + "loss": 0.0002, + "num_tokens": 53325733.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.573062539100647, + "sampling/importance_sampling_ratio/mean": 1.0001211166381836, + "sampling/importance_sampling_ratio/min": 0.5370011925697327, + "sampling/sampling_logp_difference/max": 0.6217550039291382, + "sampling/sampling_logp_difference/mean": 0.013285119086503983, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 197.921875, + "completions/mean_terminated_length": 197.921875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2960512042045593, + "epoch": 2.071078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1194285862158084, + "kl": 0.10327520221471786, + "learning_rate": 2.668381882799375e-07, + "loss": -0.0157, + "num_tokens": 53356320.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7529429197311401, + "sampling/importance_sampling_ratio/mean": 0.9996784925460815, + "sampling/importance_sampling_ratio/min": 0.5263916254043579, + "sampling/sampling_logp_difference/max": 0.6417098045349121, + "sampling/sampling_logp_difference/mean": 0.01602327823638916, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 197.640625, + "completions/mean_terminated_length": 197.640625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2522090673446655, + "epoch": 2.0723039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3949340716952856, + "kl": 0.0917755737900734, + "learning_rate": 2.662082443142068e-07, + "loss": 0.0016, + "num_tokens": 53385241.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5211591720581055, + "sampling/importance_sampling_ratio/mean": 1.0000898838043213, + "sampling/importance_sampling_ratio/min": 0.5760922431945801, + "sampling/sampling_logp_difference/max": 0.5514874458312988, + "sampling/sampling_logp_difference/mean": 0.01480923593044281, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 241.140625, + "completions/mean_terminated_length": 241.140625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2808194160461426, + "epoch": 2.073529411764706, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8278898187120234, + "kl": 0.09493476897478104, + "learning_rate": 2.6557877493092883e-07, + "loss": 0.0219, + "num_tokens": 53417746.0, + "reward": 0.3125, + "reward_std": 0.6645200252532959, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6169688701629639, + "sampling/importance_sampling_ratio/mean": 0.9999876022338867, + "sampling/importance_sampling_ratio/min": 0.601413905620575, + "sampling/sampling_logp_difference/max": 0.5084719657897949, + "sampling/sampling_logp_difference/mean": 0.01424361951649189, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 245.359375, + "completions/mean_terminated_length": 245.359375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2270507961511612, + "epoch": 2.0747549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2972264315716464, + "kl": 0.07362878322601318, + "learning_rate": 2.6494978140788686e-07, + "loss": 0.0472, + "num_tokens": 53449753.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6605846881866455, + "sampling/importance_sampling_ratio/mean": 1.000180959701538, + "sampling/importance_sampling_ratio/min": 0.6133704781532288, + "sampling/sampling_logp_difference/max": 0.5071697235107422, + "sampling/sampling_logp_difference/mean": 0.01344931311905384, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 228.703125, + "completions/mean_terminated_length": 228.703125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.27688461542129517, + "epoch": 2.075980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2010355458651496, + "kl": 0.08339104056358337, + "learning_rate": 2.643212650218976e-07, + "loss": 0.0019, + "num_tokens": 53482918.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.589503288269043, + "sampling/importance_sampling_ratio/mean": 0.9999773502349854, + "sampling/importance_sampling_ratio/min": 0.4946943521499634, + "sampling/sampling_logp_difference/max": 0.703815221786499, + "sampling/sampling_logp_difference/mean": 0.014928510412573814, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 238.765625, + "completions/mean_terminated_length": 238.765625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2972226142883301, + "epoch": 2.077205882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.024096463484243, + "kl": 0.10847216844558716, + "learning_rate": 2.6369322704881e-07, + "loss": 0.0235, + "num_tokens": 53519671.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6023660898208618, + "sampling/importance_sampling_ratio/mean": 1.0001966953277588, + "sampling/importance_sampling_ratio/min": 0.405647337436676, + "sampling/sampling_logp_difference/max": 0.9022711515426636, + "sampling/sampling_logp_difference/mean": 0.01701277121901512, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 263.625, + "completions/mean_terminated_length": 263.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.23059415817260742, + "epoch": 2.0784313725490198, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3897163961565249, + "kl": 0.08193601667881012, + "learning_rate": 2.6306566876350067e-07, + "loss": 0.0239, + "num_tokens": 53562047.0, + "reward": 0.84375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6016916036605835, + "sampling/importance_sampling_ratio/mean": 1.0005524158477783, + "sampling/importance_sampling_ratio/min": 0.6175822019577026, + "sampling/sampling_logp_difference/max": 0.48194313049316406, + "sampling/sampling_logp_difference/mean": 0.01181773655116558, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 161.578125, + "completions/mean_terminated_length": 161.578125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.19032913446426392, + "epoch": 2.079656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054749233067084666, + "kl": 0.08211810141801834, + "learning_rate": 2.6243859143987367e-07, + "loss": 0.0008, + "num_tokens": 53585268.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7697570323944092, + "sampling/importance_sampling_ratio/mean": 1.000309944152832, + "sampling/importance_sampling_ratio/min": 0.07049079239368439, + "sampling/sampling_logp_difference/max": 2.652273178100586, + "sampling/sampling_logp_difference/mean": 0.012656516395509243, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 254.015625, + "completions/mean_terminated_length": 254.015625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.27198344469070435, + "epoch": 2.0808823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0004974301028033, + "kl": 0.08722476661205292, + "learning_rate": 2.6181199635085616e-07, + "loss": 0.0209, + "num_tokens": 53616421.0, + "reward": 0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6436246633529663, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.4887487292289734, + "sampling/sampling_logp_difference/max": 0.7159067392349243, + "sampling/sampling_logp_difference/mean": 0.01370446290820837, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 277.28125, + "completions/mean_terminated_length": 277.28125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.2722983956336975, + "epoch": 2.082107843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1210881745154528, + "kl": 0.055352453142404556, + "learning_rate": 2.6118588476839607e-07, + "loss": -0.0103, + "num_tokens": 53651767.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.971760630607605, + "sampling/importance_sampling_ratio/mean": 0.9999873042106628, + "sampling/importance_sampling_ratio/min": 0.4567083418369293, + "sampling/sampling_logp_difference/max": 0.7837103605270386, + "sampling/sampling_logp_difference/mean": 0.014415323734283447, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 171.40625, + "completions/mean_terminated_length": 171.40625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.20028963685035706, + "epoch": 2.0833333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5168636163216445, + "kl": 0.06290790438652039, + "learning_rate": 2.6056025796346094e-07, + "loss": 0.0298, + "num_tokens": 53679313.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.7804113626480103, + "sampling/importance_sampling_ratio/mean": 0.9998703002929688, + "sampling/importance_sampling_ratio/min": 0.46356526017189026, + "sampling/sampling_logp_difference/max": 0.768808126449585, + "sampling/sampling_logp_difference/mean": 0.012557139620184898, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 224.484375, + "completions/mean_terminated_length": 224.484375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.26865464448928833, + "epoch": 2.0845588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.750853709796323, + "kl": 0.09681269526481628, + "learning_rate": 2.599351172060329e-07, + "loss": 0.0792, + "num_tokens": 53711472.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6248468160629272, + "sampling/importance_sampling_ratio/mean": 0.9993053078651428, + "sampling/importance_sampling_ratio/min": 0.6057427525520325, + "sampling/sampling_logp_difference/max": 0.5012998580932617, + "sampling/sampling_logp_difference/mean": 0.014584803953766823, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 219.296875, + "completions/mean_terminated_length": 219.296875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2701049745082855, + "epoch": 2.0857843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7294079164994691, + "kl": 0.09864313900470734, + "learning_rate": 2.593104637651087e-07, + "loss": 0.0051, + "num_tokens": 53745267.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6277406215667725, + "sampling/importance_sampling_ratio/mean": 0.9999047517776489, + "sampling/importance_sampling_ratio/min": 0.4596964120864868, + "sampling/sampling_logp_difference/max": 0.7771890163421631, + "sampling/sampling_logp_difference/mean": 0.013745477423071861, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 222.78125, + "completions/mean_terminated_length": 222.78125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.313871294260025, + "epoch": 2.0870098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2424166278195006, + "kl": 0.08946055173873901, + "learning_rate": 2.5868629890869463e-07, + "loss": -0.0132, + "num_tokens": 53777061.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995561838150024, + "sampling/importance_sampling_ratio/min": 0.5826087594032288, + "sampling/sampling_logp_difference/max": 0.7213950157165527, + "sampling/sampling_logp_difference/mean": 0.015899470075964928, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 241.5, + "completions/mean_terminated_length": 241.5, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1832994967699051, + "epoch": 2.088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9990023865572616, + "kl": 0.07222867012023926, + "learning_rate": 2.580626239038061e-07, + "loss": 0.0564, + "num_tokens": 53809813.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004302263259888, + "sampling/importance_sampling_ratio/min": 0.6222401857376099, + "sampling/sampling_logp_difference/max": 0.7347879409790039, + "sampling/sampling_logp_difference/mean": 0.010797273367643356, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 280.90625, + "completions/mean_terminated_length": 280.90625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.32643812894821167, + "epoch": 2.0894607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5066735371030042, + "kl": 0.0860418826341629, + "learning_rate": 2.5743944001646387e-07, + "loss": -0.1456, + "num_tokens": 53848527.0, + "reward": 0.15625, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000722885131836, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.7048373222351074, + "sampling/sampling_logp_difference/mean": 0.014995020814239979, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 188.625, + "completions/mean_terminated_length": 188.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.28493088483810425, + "epoch": 2.090686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.007547341251892, + "kl": 0.10750987380743027, + "learning_rate": 2.568167485116919e-07, + "loss": -0.0033, + "num_tokens": 53881191.0, + "reward": 0.8125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.8167446851730347, + "sampling/importance_sampling_ratio/mean": 1.0001343488693237, + "sampling/importance_sampling_ratio/min": 0.5686253905296326, + "sampling/sampling_logp_difference/max": 0.5970462560653687, + "sampling/sampling_logp_difference/mean": 0.015858035534620285, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 193.890625, + "completions/mean_terminated_length": 193.890625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.28339624404907227, + "epoch": 2.0919117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5254505440260446, + "kl": 0.1307823657989502, + "learning_rate": 2.5619455065351435e-07, + "loss": -0.1184, + "num_tokens": 53913744.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8837978839874268, + "sampling/importance_sampling_ratio/mean": 0.9992457628250122, + "sampling/importance_sampling_ratio/min": 0.6172720193862915, + "sampling/sampling_logp_difference/max": 0.6332898139953613, + "sampling/sampling_logp_difference/mean": 0.01515759713947773, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 184.515625, + "completions/mean_terminated_length": 184.515625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.16468603909015656, + "epoch": 2.093137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18154179047978547, + "kl": 0.09442128241062164, + "learning_rate": 2.555728477049532e-07, + "loss": 0.0009, + "num_tokens": 53940977.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5411758422851562, + "sampling/importance_sampling_ratio/mean": 1.0002518892288208, + "sampling/importance_sampling_ratio/min": 0.591300368309021, + "sampling/sampling_logp_difference/max": 0.5254311561584473, + "sampling/sampling_logp_difference/mean": 0.010360531508922577, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 199.453125, + "completions/mean_terminated_length": 199.453125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.20893830060958862, + "epoch": 2.094362745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6978978345600726, + "kl": 0.11723177134990692, + "learning_rate": 2.5495164092802646e-07, + "loss": -0.1173, + "num_tokens": 53973150.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.969475507736206, + "sampling/importance_sampling_ratio/mean": 0.999012291431427, + "sampling/importance_sampling_ratio/min": 0.02537323720753193, + "sampling/sampling_logp_difference/max": 3.674060344696045, + "sampling/sampling_logp_difference/mean": 0.01393135730177164, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 228.5, + "completions/mean_terminated_length": 228.5, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.2282354086637497, + "epoch": 2.0955882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4792067205301653, + "kl": 0.07126940041780472, + "learning_rate": 2.5433093158374437e-07, + "loss": -0.059, + "num_tokens": 54005422.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.690609335899353, + "sampling/importance_sampling_ratio/mean": 0.9998007416725159, + "sampling/importance_sampling_ratio/min": 0.609043538570404, + "sampling/sampling_logp_difference/max": 0.5250890254974365, + "sampling/sampling_logp_difference/mean": 0.012127692811191082, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 294.09375, + "completions/mean_terminated_length": 294.09375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.268255352973938, + "epoch": 2.096813725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.564716344849188, + "kl": 0.07376987487077713, + "learning_rate": 2.537107209321074e-07, + "loss": -0.0202, + "num_tokens": 54044004.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6540254354476929, + "sampling/importance_sampling_ratio/mean": 1.0002803802490234, + "sampling/importance_sampling_ratio/min": 0.5292773246765137, + "sampling/sampling_logp_difference/max": 0.6362427473068237, + "sampling/sampling_logp_difference/mean": 0.013309784233570099, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 5000.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 195.9365234375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.24611814320087433, + "epoch": 2.0980392156862746, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6217309517852607, + "kl": 0.11633831262588501, + "learning_rate": 2.5309101023210424e-07, + "loss": 1.0697, + "num_tokens": 54077044.0, + "reward": 0.625, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6280970573425293, + "sampling/importance_sampling_ratio/mean": 0.9997564554214478, + "sampling/importance_sampling_ratio/min": 0.5820223689079285, + "sampling/sampling_logp_difference/max": 0.5412464141845703, + "sampling/sampling_logp_difference/mean": 0.012156832963228226, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2816395163536072, + "epoch": 2.099264705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0319645059625984, + "kl": 0.07405499368906021, + "learning_rate": 2.524718007417081e-07, + "loss": 0.0138, + "num_tokens": 54109964.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002710819244385, + "sampling/importance_sampling_ratio/min": 0.5820378065109253, + "sampling/sampling_logp_difference/max": 0.7261428833007812, + "sampling/sampling_logp_difference/mean": 0.014796335250139236, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 251.6875, + "completions/mean_terminated_length": 251.6875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.27752459049224854, + "epoch": 2.1004901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4228132897805845, + "kl": 0.08228473365306854, + "learning_rate": 2.518530937178751e-07, + "loss": -0.0115, + "num_tokens": 54147112.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6428769826889038, + "sampling/importance_sampling_ratio/mean": 0.9996975660324097, + "sampling/importance_sampling_ratio/min": 0.524318516254425, + "sampling/sampling_logp_difference/max": 0.6456558704376221, + "sampling/sampling_logp_difference/mean": 0.013332992792129517, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 206.390625, + "completions/mean_terminated_length": 206.390625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3341542184352875, + "epoch": 2.1017156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05268439635242992, + "kl": 0.10409331321716309, + "learning_rate": 2.512348904165411e-07, + "loss": 0.001, + "num_tokens": 54178337.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6648049354553223, + "sampling/importance_sampling_ratio/mean": 0.9995940923690796, + "sampling/importance_sampling_ratio/min": 0.4500335156917572, + "sampling/sampling_logp_difference/max": 0.7984331846237183, + "sampling/sampling_logp_difference/mean": 0.015868140384554863, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 183.515625, + "completions/mean_terminated_length": 183.515625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.18172964453697205, + "epoch": 2.1029411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38210724360668885, + "kl": 0.06356281787157059, + "learning_rate": 2.5061719209262e-07, + "loss": 0.0006, + "num_tokens": 54204898.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4399677515029907, + "sampling/importance_sampling_ratio/mean": 1.0001122951507568, + "sampling/importance_sampling_ratio/min": 0.09123151749372482, + "sampling/sampling_logp_difference/max": 2.394354820251465, + "sampling/sampling_logp_difference/mean": 0.011870104819536209, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 199.796875, + "completions/mean_terminated_length": 199.796875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2558775544166565, + "epoch": 2.1041666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0363360716687429, + "kl": 0.08049780875444412, + "learning_rate": 2.500000000000001e-07, + "loss": -0.0034, + "num_tokens": 54237365.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6464908123016357, + "sampling/importance_sampling_ratio/mean": 1.0003085136413574, + "sampling/importance_sampling_ratio/min": 0.5331515073776245, + "sampling/sampling_logp_difference/max": 0.6289496421813965, + "sampling/sampling_logp_difference/mean": 0.015734516084194183, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 229.03125, + "completions/mean_terminated_length": 229.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.1508364975452423, + "epoch": 2.105392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03236801132415331, + "kl": 0.05850634723901749, + "learning_rate": 2.49383315391542e-07, + "loss": 0.0005, + "num_tokens": 54265927.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6401046514511108, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.6105112433433533, + "sampling/sampling_logp_difference/max": 0.49476003646850586, + "sampling/sampling_logp_difference/mean": 0.010126762092113495, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 183.609375, + "completions/mean_terminated_length": 183.609375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.20529486238956451, + "epoch": 2.1066176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9023498920204884, + "kl": 0.08519157767295837, + "learning_rate": 2.4876713951907685e-07, + "loss": -0.0059, + "num_tokens": 54293726.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994800686836243, + "sampling/importance_sampling_ratio/min": 0.5265898108482361, + "sampling/sampling_logp_difference/max": 0.7503294944763184, + "sampling/sampling_logp_difference/mean": 0.0120610436424613, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 247.140625, + "completions/mean_terminated_length": 247.140625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.22897523641586304, + "epoch": 2.107843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3870651483158327, + "kl": 0.1443021297454834, + "learning_rate": 2.481514736334022e-07, + "loss": 0.0068, + "num_tokens": 54323863.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004396438598633, + "sampling/importance_sampling_ratio/min": 0.608461320400238, + "sampling/sampling_logp_difference/max": 0.7307519912719727, + "sampling/sampling_logp_difference/mean": 0.011523693799972534, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 226.140625, + "completions/mean_terminated_length": 226.140625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.26521825790405273, + "epoch": 2.1090686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1330318041901481, + "kl": 0.08887003362178802, + "learning_rate": 2.4753631898428134e-07, + "loss": 0.0009, + "num_tokens": 54358816.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6283246278762817, + "sampling/importance_sampling_ratio/mean": 0.999549925327301, + "sampling/importance_sampling_ratio/min": 0.5542429089546204, + "sampling/sampling_logp_difference/max": 0.5901522636413574, + "sampling/sampling_logp_difference/mean": 0.014785770326852798, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 241.796875, + "completions/mean_terminated_length": 241.796875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.3558571934700012, + "epoch": 2.110294117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6903484152336, + "kl": 0.1265546679496765, + "learning_rate": 2.4692167682043853e-07, + "loss": -0.0268, + "num_tokens": 54404451.0, + "reward": -0.21875, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.9876798391342163, + "sampling/importance_sampling_ratio/mean": 1.0000967979431152, + "sampling/importance_sampling_ratio/min": 0.24371837079524994, + "sampling/sampling_logp_difference/max": 1.4117419719696045, + "sampling/sampling_logp_difference/mean": 0.017827820032835007, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 256.8125, + "completions/mean_terminated_length": 256.8125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3483244478702545, + "epoch": 2.111519607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9842230917924334, + "kl": 0.1271551251411438, + "learning_rate": 2.4630754838955896e-07, + "loss": -0.0121, + "num_tokens": 54437655.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999490976333618, + "sampling/importance_sampling_ratio/min": 0.5489413738250732, + "sampling/sampling_logp_difference/max": 0.771472692489624, + "sampling/sampling_logp_difference/mean": 0.017512347549200058, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 216.328125, + "completions/mean_terminated_length": 216.328125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.30079299211502075, + "epoch": 2.1127450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.027835491538995, + "kl": 0.1183740645647049, + "learning_rate": 2.456939349382843e-07, + "loss": -0.0364, + "num_tokens": 54470316.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994838833808899, + "sampling/importance_sampling_ratio/min": 0.3653780519962311, + "sampling/sampling_logp_difference/max": 1.0068227052688599, + "sampling/sampling_logp_difference/mean": 0.015461128205060959, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 143.5, + "completions/mean_terminated_length": 143.5, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.22464868426322937, + "epoch": 2.113970588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0455863379671696, + "kl": 0.08102913200855255, + "learning_rate": 2.450808377122107e-07, + "loss": 0.0008, + "num_tokens": 54497180.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005425214767456, + "sampling/importance_sampling_ratio/min": 0.6485685110092163, + "sampling/sampling_logp_difference/max": 0.7369003295898438, + "sampling/sampling_logp_difference/mean": 0.012720860540866852, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 178.6875, + "completions/mean_terminated_length": 178.6875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.19968390464782715, + "epoch": 2.1151960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0752407669557944, + "kl": 0.0696045309305191, + "learning_rate": 2.4446825795588716e-07, + "loss": 0.0007, + "num_tokens": 54528104.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5081994533538818, + "sampling/importance_sampling_ratio/mean": 1.0000910758972168, + "sampling/importance_sampling_ratio/min": 0.6319701671600342, + "sampling/sampling_logp_difference/max": 0.45891308784484863, + "sampling/sampling_logp_difference/mean": 0.011953980661928654, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 230.3125, + "completions/mean_terminated_length": 230.3125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.21571284532546997, + "epoch": 2.116421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1012846289450315, + "kl": 0.06651733815670013, + "learning_rate": 2.438561969128114e-07, + "loss": 0.0242, + "num_tokens": 54562956.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6443967819213867, + "sampling/importance_sampling_ratio/mean": 1.0001633167266846, + "sampling/importance_sampling_ratio/min": 0.44692784547805786, + "sampling/sampling_logp_difference/max": 0.8053581714630127, + "sampling/sampling_logp_difference/mean": 0.011286087334156036, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 164.6875, + "completions/mean_terminated_length": 164.6875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24942715466022491, + "epoch": 2.1176470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5723757822993916, + "kl": 0.09061464667320251, + "learning_rate": 2.43244655825429e-07, + "loss": -0.0274, + "num_tokens": 54587672.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.609130859375, + "sampling/importance_sampling_ratio/mean": 0.9997722506523132, + "sampling/importance_sampling_ratio/min": 0.5053581595420837, + "sampling/sampling_logp_difference/max": 0.682487964630127, + "sampling/sampling_logp_difference/mean": 0.013893929310142994, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 173.90625, + "completions/mean_terminated_length": 173.90625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.15339696407318115, + "epoch": 2.1188725490196076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04066181976972253, + "kl": 0.04262594133615494, + "learning_rate": 2.4263363593512903e-07, + "loss": 0.0004, + "num_tokens": 54613538.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000312328338623, + "sampling/importance_sampling_ratio/min": 0.4290059804916382, + "sampling/sampling_logp_difference/max": 0.8462843894958496, + "sampling/sampling_logp_difference/mean": 0.009953014552593231, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 161.15625, + "completions/mean_terminated_length": 161.15625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.22494977712631226, + "epoch": 2.1200980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0627337312082299, + "kl": 0.1084507554769516, + "learning_rate": 2.4202313848224364e-07, + "loss": 0.0011, + "num_tokens": 54641212.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6083506345748901, + "sampling/importance_sampling_ratio/mean": 1.0007972717285156, + "sampling/importance_sampling_ratio/min": 0.6186642050743103, + "sampling/sampling_logp_difference/max": 0.4801926612854004, + "sampling/sampling_logp_difference/mean": 0.013805609196424484, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 271.921875, + "completions/mean_terminated_length": 271.921875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3591475486755371, + "epoch": 2.1213235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9981361482983514, + "kl": 0.09604911506175995, + "learning_rate": 2.414131647060436e-07, + "loss": 0.0004, + "num_tokens": 54684295.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7665029764175415, + "sampling/importance_sampling_ratio/mean": 0.9997198581695557, + "sampling/importance_sampling_ratio/min": 0.47804930806159973, + "sampling/sampling_logp_difference/max": 0.7380414009094238, + "sampling/sampling_logp_difference/mean": 0.0156893078237772, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 178.03125, + "completions/mean_terminated_length": 178.03125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.17584030330181122, + "epoch": 2.122549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04942371544216179, + "kl": 0.0640132874250412, + "learning_rate": 2.4080371584473745e-07, + "loss": 0.0007, + "num_tokens": 54711561.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6530674695968628, + "sampling/importance_sampling_ratio/mean": 0.9999580979347229, + "sampling/importance_sampling_ratio/min": 0.5910108685493469, + "sampling/sampling_logp_difference/max": 0.5259209275245667, + "sampling/sampling_logp_difference/mean": 0.013702481985092163, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 263.546875, + "completions/mean_terminated_length": 263.546875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.23605573177337646, + "epoch": 2.123774509803922, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5635733484446757, + "kl": 0.07343361526727676, + "learning_rate": 2.4019479313546757e-07, + "loss": -0.0533, + "num_tokens": 54752972.0, + "reward": -0.125, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.9693632125854492, + "sampling/importance_sampling_ratio/mean": 0.9998065233230591, + "sampling/importance_sampling_ratio/min": 0.477456659078598, + "sampling/sampling_logp_difference/max": 0.7392818927764893, + "sampling/sampling_logp_difference/mean": 0.01350579783320427, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 200.71875, + "completions/mean_terminated_length": 200.71875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.35887908935546875, + "epoch": 2.125, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0794125377573662, + "kl": 0.13092204928398132, + "learning_rate": 2.395863978143083e-07, + "loss": -0.0382, + "num_tokens": 54791514.0, + "reward": -0.0625, + "reward_std": 0.6285127401351929, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7535253763198853, + "sampling/importance_sampling_ratio/mean": 0.9996278285980225, + "sampling/importance_sampling_ratio/min": 0.6149638891220093, + "sampling/sampling_logp_difference/max": 0.5616282224655151, + "sampling/sampling_logp_difference/mean": 0.01837782934308052, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 278.703125, + "completions/mean_terminated_length": 278.703125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.22424054145812988, + "epoch": 2.126225490196078, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4767677478882237, + "kl": 0.06706281751394272, + "learning_rate": 2.3897853111626417e-07, + "loss": -0.0496, + "num_tokens": 54829255.0, + "reward": 0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.598141074180603, + "sampling/importance_sampling_ratio/mean": 1.000441312789917, + "sampling/importance_sampling_ratio/min": 0.3738076984882355, + "sampling/sampling_logp_difference/max": 0.9840137958526611, + "sampling/sampling_logp_difference/mean": 0.013220787979662418, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 163.3125, + "completions/mean_terminated_length": 163.3125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.24036374688148499, + "epoch": 2.127450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2965382984415355, + "kl": 0.08836943656206131, + "learning_rate": 2.383711942752652e-07, + "loss": -0.0056, + "num_tokens": 54858171.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5497233867645264, + "sampling/importance_sampling_ratio/mean": 0.9995202422142029, + "sampling/importance_sampling_ratio/min": 0.4811636507511139, + "sampling/sampling_logp_difference/max": 0.7315478324890137, + "sampling/sampling_logp_difference/mean": 0.014389409683644772, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 215.796875, + "completions/mean_terminated_length": 215.796875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.32177525758743286, + "epoch": 2.1286764705882355, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5956116910340583, + "kl": 0.09418761730194092, + "learning_rate": 2.377643885241674e-07, + "loss": 0.0455, + "num_tokens": 54895390.0, + "reward": 0.46875, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5071581602096558, + "sampling/importance_sampling_ratio/mean": 1.0002448558807373, + "sampling/importance_sampling_ratio/min": 0.5393481254577637, + "sampling/sampling_logp_difference/max": 0.617393970489502, + "sampling/sampling_logp_difference/mean": 0.015557506121695042, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 201.078125, + "completions/mean_terminated_length": 201.078125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3626950979232788, + "epoch": 2.1299019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8998408094699397, + "kl": 0.09067320823669434, + "learning_rate": 2.371581150947476e-07, + "loss": 0.0716, + "num_tokens": 54926707.0, + "reward": 0.75, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.7580565214157104, + "sampling/importance_sampling_ratio/mean": 0.99964439868927, + "sampling/importance_sampling_ratio/min": 0.5687322020530701, + "sampling/sampling_logp_difference/max": 0.5643455982208252, + "sampling/sampling_logp_difference/mean": 0.01802055910229683, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 256.921875, + "completions/mean_terminated_length": 256.921875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.24795296788215637, + "epoch": 2.1311274509803924, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9659707270123654, + "kl": 0.08193226158618927, + "learning_rate": 2.3655237521770282e-07, + "loss": -0.0085, + "num_tokens": 54963086.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5624288320541382, + "sampling/importance_sampling_ratio/mean": 0.9994508624076843, + "sampling/importance_sampling_ratio/min": 0.3866180181503296, + "sampling/sampling_logp_difference/max": 0.9503180980682373, + "sampling/sampling_logp_difference/mean": 0.012907266616821289, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 193.703125, + "completions/mean_terminated_length": 193.703125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2887061536312103, + "epoch": 2.1323529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4869115641371657, + "kl": 0.10755470395088196, + "learning_rate": 2.3594717012264642e-07, + "loss": -0.0437, + "num_tokens": 54995083.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.650195598602295, + "sampling/importance_sampling_ratio/mean": 0.9998593926429749, + "sampling/importance_sampling_ratio/min": 0.620280385017395, + "sampling/sampling_logp_difference/max": 0.5008938312530518, + "sampling/sampling_logp_difference/mean": 0.015658602118492126, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 243.109375, + "completions/mean_terminated_length": 243.109375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3275303840637207, + "epoch": 2.133578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.712720509641171, + "kl": 0.11412831395864487, + "learning_rate": 2.3534250103810627e-07, + "loss": 0.0513, + "num_tokens": 55029090.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6466587781906128, + "sampling/importance_sampling_ratio/mean": 0.9996645450592041, + "sampling/importance_sampling_ratio/min": 0.37343189120292664, + "sampling/sampling_logp_difference/max": 0.9850196838378906, + "sampling/sampling_logp_difference/mean": 0.015656957402825356, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 278.390625, + "completions/mean_terminated_length": 278.390625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3630286455154419, + "epoch": 2.1348039215686274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8417698342017348, + "kl": 0.11501502245664597, + "learning_rate": 2.3473836919152263e-07, + "loss": 0.1042, + "num_tokens": 55067387.0, + "reward": 0.46875, + "reward_std": 0.8723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.592509150505066, + "sampling/importance_sampling_ratio/mean": 1.0000813007354736, + "sampling/importance_sampling_ratio/min": 0.4159523844718933, + "sampling/sampling_logp_difference/max": 0.8771845102310181, + "sampling/sampling_logp_difference/mean": 0.01706148311495781, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 215.609375, + "completions/mean_terminated_length": 215.609375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.28509455919265747, + "epoch": 2.136029411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.79398630714614, + "kl": 0.11121399700641632, + "learning_rate": 2.3413477580924475e-07, + "loss": 0.074, + "num_tokens": 55098930.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.8181973695755005, + "sampling/importance_sampling_ratio/mean": 0.999822199344635, + "sampling/importance_sampling_ratio/min": 0.3234425485134125, + "sampling/sampling_logp_difference/max": 1.12873375415802, + "sampling/sampling_logp_difference/mean": 0.014676484279334545, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 226.65625, + "completions/mean_terminated_length": 226.65625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.20623424649238586, + "epoch": 2.1372549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04135339761860059, + "kl": 0.0599723644554615, + "learning_rate": 2.3353172211652884e-07, + "loss": 0.0006, + "num_tokens": 55134700.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999659657478333, + "sampling/importance_sampling_ratio/min": 0.6207950115203857, + "sampling/sampling_logp_difference/max": 0.9944319725036621, + "sampling/sampling_logp_difference/mean": 0.011734064668416977, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 188.1875, + "completions/mean_terminated_length": 188.1875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2525515556335449, + "epoch": 2.138480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4233141311733437, + "kl": 0.09250284731388092, + "learning_rate": 2.329292093375356e-07, + "loss": 0.0167, + "num_tokens": 55162600.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7998061180114746, + "sampling/importance_sampling_ratio/mean": 0.9999781847000122, + "sampling/importance_sampling_ratio/min": 0.27488502860069275, + "sampling/sampling_logp_difference/max": 1.2914023399353027, + "sampling/sampling_logp_difference/mean": 0.014600848779082298, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 193.71875, + "completions/mean_terminated_length": 193.71875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.29777389764785767, + "epoch": 2.139705882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7411493991118427, + "kl": 0.12776996195316315, + "learning_rate": 2.3232723869532816e-07, + "loss": -0.0418, + "num_tokens": 55193926.0, + "reward": 0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997895956039429, + "sampling/importance_sampling_ratio/min": 0.3743273615837097, + "sampling/sampling_logp_difference/max": 1.0381970405578613, + "sampling/sampling_logp_difference/mean": 0.01634758710861206, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 210.3125, + "completions/mean_terminated_length": 210.3125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.30516448616981506, + "epoch": 2.1409313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1469881691496775, + "kl": 0.1371455043554306, + "learning_rate": 2.3172581141186858e-07, + "loss": -0.0006, + "num_tokens": 55222106.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5972622632980347, + "sampling/importance_sampling_ratio/mean": 1.000130534172058, + "sampling/importance_sampling_ratio/min": 0.6546982526779175, + "sampling/sampling_logp_difference/max": 0.4682910442352295, + "sampling/sampling_logp_difference/mean": 0.015357891097664833, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 190.296875, + "completions/mean_terminated_length": 190.296875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2997080683708191, + "epoch": 2.142156862745098, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.392021870202034, + "kl": 0.11923129856586456, + "learning_rate": 2.3112492870801602e-07, + "loss": -0.0323, + "num_tokens": 55254797.0, + "reward": 0.46875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8146553039550781, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.3732207417488098, + "sampling/sampling_logp_difference/max": 0.9855852127075195, + "sampling/sampling_logp_difference/mean": 0.01580945774912834, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 194.796875, + "completions/mean_terminated_length": 194.796875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.22310101985931396, + "epoch": 2.1433823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.40417591739693, + "kl": 0.07776062190532684, + "learning_rate": 2.3052459180352458e-07, + "loss": 0.052, + "num_tokens": 55286880.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.612930417060852, + "sampling/importance_sampling_ratio/mean": 1.000563621520996, + "sampling/importance_sampling_ratio/min": 0.628268301486969, + "sampling/sampling_logp_difference/max": 0.47805261611938477, + "sampling/sampling_logp_difference/mean": 0.013772143051028252, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 222.90625, + "completions/mean_terminated_length": 222.90625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2597554326057434, + "epoch": 2.144607843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8060586946344956, + "kl": 0.08378937095403671, + "learning_rate": 2.2992480191704e-07, + "loss": -0.0062, + "num_tokens": 55325754.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001883506774902, + "sampling/importance_sampling_ratio/min": 0.5075283646583557, + "sampling/sampling_logp_difference/max": 0.7715139389038086, + "sampling/sampling_logp_difference/mean": 0.014584079384803772, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 189.9375, + "completions/mean_terminated_length": 189.9375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.22471821308135986, + "epoch": 2.1458333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.637028134535316, + "kl": 0.12206414341926575, + "learning_rate": 2.2932556026609777e-07, + "loss": 0.0076, + "num_tokens": 55359654.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6438500881195068, + "sampling/importance_sampling_ratio/mean": 1.0002222061157227, + "sampling/importance_sampling_ratio/min": 0.4182717204093933, + "sampling/sampling_logp_difference/max": 0.8716239929199219, + "sampling/sampling_logp_difference/mean": 0.013608250766992569, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 222.109375, + "completions/mean_terminated_length": 222.109375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2933546304702759, + "epoch": 2.1470588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6332167204363761, + "kl": 0.08434764295816422, + "learning_rate": 2.2872686806712032e-07, + "loss": 0.0377, + "num_tokens": 55396189.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.612004280090332, + "sampling/importance_sampling_ratio/mean": 0.9994946718215942, + "sampling/importance_sampling_ratio/min": 0.5971236824989319, + "sampling/sampling_logp_difference/max": 0.5156309604644775, + "sampling/sampling_logp_difference/mean": 0.015469004400074482, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 191.203125, + "completions/mean_terminated_length": 191.203125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.31936416029930115, + "epoch": 2.1482843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5481373616086007, + "kl": 0.09290601313114166, + "learning_rate": 2.2812872653541498e-07, + "loss": -0.0505, + "num_tokens": 55432778.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003182888031006, + "sampling/importance_sampling_ratio/min": 0.3702169358730316, + "sampling/sampling_logp_difference/max": 2.06929612159729, + "sampling/sampling_logp_difference/mean": 0.018548715859651566, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 187.484375, + "completions/mean_terminated_length": 187.484375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.29913410544395447, + "epoch": 2.1495098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.4493544700953307, + "kl": 0.11844295263290405, + "learning_rate": 2.2753113688517155e-07, + "loss": -0.0129, + "num_tokens": 55468217.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.9188014268875122, + "sampling/importance_sampling_ratio/mean": 1.0003360509872437, + "sampling/importance_sampling_ratio/min": 0.5287709832191467, + "sampling/sampling_logp_difference/max": 0.6517007350921631, + "sampling/sampling_logp_difference/mean": 0.015478193759918213, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 212.59375, + "completions/mean_terminated_length": 212.59375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3491870164871216, + "epoch": 2.150735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.395665365436369, + "kl": 0.10798598825931549, + "learning_rate": 2.2693410032945853e-07, + "loss": -0.0155, + "num_tokens": 55504319.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6156541109085083, + "sampling/importance_sampling_ratio/mean": 1.0005502700805664, + "sampling/importance_sampling_ratio/min": 0.603910505771637, + "sampling/sampling_logp_difference/max": 0.5043292045593262, + "sampling/sampling_logp_difference/mean": 0.017521340399980545, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 197.84375, + "completions/mean_terminated_length": 197.84375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.26326245069503784, + "epoch": 2.1519607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9120040970065213, + "kl": 0.09008113294839859, + "learning_rate": 2.2633761808022272e-07, + "loss": -0.0423, + "num_tokens": 55537045.0, + "reward": 0.46875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997286200523376, + "sampling/importance_sampling_ratio/min": 0.5443238019943237, + "sampling/sampling_logp_difference/max": 0.8299136161804199, + "sampling/sampling_logp_difference/mean": 0.014764741994440556, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 181.09375, + "completions/mean_terminated_length": 181.09375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.24075452983379364, + "epoch": 2.153186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2333513103896843, + "kl": 0.12325990200042725, + "learning_rate": 2.2574169134828526e-07, + "loss": 0.0104, + "num_tokens": 55563707.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6210919618606567, + "sampling/importance_sampling_ratio/mean": 1.0004346370697021, + "sampling/importance_sampling_ratio/min": 0.5685502290725708, + "sampling/sampling_logp_difference/max": 0.5646655559539795, + "sampling/sampling_logp_difference/mean": 0.013588340021669865, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 222.734375, + "completions/mean_terminated_length": 222.734375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.2973715364933014, + "epoch": 2.1544117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05236871317810249, + "kl": 0.11249151825904846, + "learning_rate": 2.2514632134333932e-07, + "loss": 0.0011, + "num_tokens": 55595642.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6206932067871094, + "sampling/importance_sampling_ratio/mean": 1.000004529953003, + "sampling/importance_sampling_ratio/min": 0.4218224287033081, + "sampling/sampling_logp_difference/max": 0.863170862197876, + "sampling/sampling_logp_difference/mean": 0.01540279109030962, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 216.3125, + "completions/mean_terminated_length": 216.3125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.24999091029167175, + "epoch": 2.155637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4996968531414558, + "kl": 0.07931873947381973, + "learning_rate": 2.2455150927394878e-07, + "loss": 0.0293, + "num_tokens": 55627822.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.872873306274414, + "sampling/importance_sampling_ratio/mean": 0.9997075200080872, + "sampling/importance_sampling_ratio/min": 0.5429760813713074, + "sampling/sampling_logp_difference/max": 0.6274738311767578, + "sampling/sampling_logp_difference/mean": 0.013102727010846138, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 194.703125, + "completions/mean_terminated_length": 194.703125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.28348392248153687, + "epoch": 2.156862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1725029071104536, + "kl": 0.09887401759624481, + "learning_rate": 2.2395725634754402e-07, + "loss": -0.0066, + "num_tokens": 55659995.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5785914659500122, + "sampling/importance_sampling_ratio/mean": 1.00052809715271, + "sampling/importance_sampling_ratio/min": 0.6561868190765381, + "sampling/sampling_logp_difference/max": 0.45653295516967773, + "sampling/sampling_logp_difference/mean": 0.014698462560772896, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 241.71875, + "completions/mean_terminated_length": 241.71875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.19589178264141083, + "epoch": 2.1580882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1591875727037666, + "kl": 0.05602003633975983, + "learning_rate": 2.2336356377042143e-07, + "loss": 0.0176, + "num_tokens": 55690489.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5980969667434692, + "sampling/importance_sampling_ratio/mean": 0.9999008178710938, + "sampling/importance_sampling_ratio/min": 0.40407630801200867, + "sampling/sampling_logp_difference/max": 0.906151533126831, + "sampling/sampling_logp_difference/mean": 0.011489280499517918, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 159.796875, + "completions/mean_terminated_length": 159.796875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.21925659477710724, + "epoch": 2.159313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09653099240163052, + "kl": 0.08598222583532333, + "learning_rate": 2.2277043274773854e-07, + "loss": 0.0008, + "num_tokens": 55719084.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.790588140487671, + "sampling/importance_sampling_ratio/mean": 1.0003536939620972, + "sampling/importance_sampling_ratio/min": 0.6108417510986328, + "sampling/sampling_logp_difference/max": 0.5825440883636475, + "sampling/sampling_logp_difference/mean": 0.014810223132371902, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 206.28125, + "completions/mean_terminated_length": 206.28125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.23432119190692902, + "epoch": 2.1605392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1632752163632758, + "kl": 0.05296270549297333, + "learning_rate": 2.221778644835144e-07, + "loss": -0.0043, + "num_tokens": 55748526.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.944764494895935, + "sampling/importance_sampling_ratio/mean": 0.9997674822807312, + "sampling/importance_sampling_ratio/min": 0.44626033306121826, + "sampling/sampling_logp_difference/max": 0.8068528175354004, + "sampling/sampling_logp_difference/mean": 0.0140206478536129, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 183.28125, + "completions/mean_terminated_length": 183.28125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.22710749506950378, + "epoch": 2.161764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0621859740927757, + "kl": 0.07478615641593933, + "learning_rate": 2.215858601806246e-07, + "loss": 0.0007, + "num_tokens": 55775776.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999275207519531, + "sampling/importance_sampling_ratio/min": 0.5677725672721863, + "sampling/sampling_logp_difference/max": 0.8884508609771729, + "sampling/sampling_logp_difference/mean": 0.01352921687066555, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 162.609375, + "completions/mean_terminated_length": 162.609375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2223511040210724, + "epoch": 2.1629901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.048509211810651, + "kl": 0.0845133364200592, + "learning_rate": 2.2099442104080075e-07, + "loss": -0.0478, + "num_tokens": 55800503.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5168209075927734, + "sampling/importance_sampling_ratio/mean": 1.0006378889083862, + "sampling/importance_sampling_ratio/min": 0.5227040648460388, + "sampling/sampling_logp_difference/max": 0.6487398147583008, + "sampling/sampling_logp_difference/mean": 0.012828944250941277, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 221.328125, + "completions/mean_terminated_length": 221.328125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2978760004043579, + "epoch": 2.1642156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0565074687144558, + "kl": 0.10850057005882263, + "learning_rate": 2.2040354826462664e-07, + "loss": -0.0032, + "num_tokens": 55835996.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.9099600315093994, + "sampling/importance_sampling_ratio/mean": 0.9992789030075073, + "sampling/importance_sampling_ratio/min": 0.28735923767089844, + "sampling/sampling_logp_difference/max": 1.2470221519470215, + "sampling/sampling_logp_difference/mean": 0.01584113948047161, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 155.53125, + "completions/mean_terminated_length": 155.53125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.17103657126426697, + "epoch": 2.1654411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5059775577140329, + "kl": 0.06305205821990967, + "learning_rate": 2.1981324305153642e-07, + "loss": 0.0093, + "num_tokens": 55861022.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9056557416915894, + "sampling/importance_sampling_ratio/mean": 0.9994888305664062, + "sampling/importance_sampling_ratio/min": 0.5924586057662964, + "sampling/sampling_logp_difference/max": 0.6448261737823486, + "sampling/sampling_logp_difference/mean": 0.012093236669898033, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 233.390625, + "completions/mean_terminated_length": 233.390625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3735348582267761, + "epoch": 2.1666666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4999732441846727, + "kl": 0.14664384722709656, + "learning_rate": 2.192235065998126e-07, + "loss": 0.0184, + "num_tokens": 55894487.0, + "reward": 0.03125, + "reward_std": 0.7744960784912109, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9880139827728271, + "sampling/importance_sampling_ratio/mean": 0.9999355673789978, + "sampling/importance_sampling_ratio/min": 0.45969530940055847, + "sampling/sampling_logp_difference/max": 0.7771914005279541, + "sampling/sampling_logp_difference/mean": 0.0183719452470541, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 194.4375, + "completions/mean_terminated_length": 194.4375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.22018903493881226, + "epoch": 2.167892156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0396575520521913, + "kl": 0.05755123496055603, + "learning_rate": 2.1863434010658272e-07, + "loss": 0.0006, + "num_tokens": 55925059.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6318347454071045, + "sampling/importance_sampling_ratio/mean": 1.0003325939178467, + "sampling/importance_sampling_ratio/min": 0.5893513560295105, + "sampling/sampling_logp_difference/max": 0.5287327766418457, + "sampling/sampling_logp_difference/mean": 0.013786889612674713, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 193.703125, + "completions/mean_terminated_length": 193.703125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2954903841018677, + "epoch": 2.1691176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.140746619090501, + "kl": 0.1304420530796051, + "learning_rate": 2.1804574476781733e-07, + "loss": 0.0963, + "num_tokens": 55953584.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.7958390712738037, + "sampling/importance_sampling_ratio/mean": 1.0007104873657227, + "sampling/importance_sampling_ratio/min": 0.27278798818588257, + "sampling/sampling_logp_difference/max": 1.2990604639053345, + "sampling/sampling_logp_difference/mean": 0.015290407463908195, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 164.6875, + "completions/mean_terminated_length": 164.6875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.17861588299274445, + "epoch": 2.170343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6114393341212648, + "kl": 0.10109886527061462, + "learning_rate": 2.1745772177832755e-07, + "loss": 0.001, + "num_tokens": 55983580.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6309486627578735, + "sampling/importance_sampling_ratio/mean": 0.9999058246612549, + "sampling/importance_sampling_ratio/min": 0.5794409513473511, + "sampling/sampling_logp_difference/max": 0.5456914901733398, + "sampling/sampling_logp_difference/mean": 0.010914549231529236, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 181.671875, + "completions/mean_terminated_length": 181.671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.19727925956249237, + "epoch": 2.1715686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04535562195449047, + "kl": 0.05515056103467941, + "learning_rate": 2.1687027233176318e-07, + "loss": 0.0005, + "num_tokens": 56009639.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8589531183242798, + "sampling/importance_sampling_ratio/mean": 1.0001940727233887, + "sampling/importance_sampling_ratio/min": 0.44680851697921753, + "sampling/sampling_logp_difference/max": 0.8056252002716064, + "sampling/sampling_logp_difference/mean": 0.013086721301078796, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 222.03125, + "completions/mean_terminated_length": 222.03125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.23033404350280762, + "epoch": 2.172794117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3009567177394339, + "kl": 0.09654970467090607, + "learning_rate": 2.1628339762060914e-07, + "loss": -0.112, + "num_tokens": 56043625.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6859341859817505, + "sampling/importance_sampling_ratio/mean": 1.0001462697982788, + "sampling/importance_sampling_ratio/min": 0.6387834548950195, + "sampling/sampling_logp_difference/max": 0.5223197937011719, + "sampling/sampling_logp_difference/mean": 0.01319871935993433, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 164.546875, + "completions/mean_terminated_length": 164.546875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.24296662211418152, + "epoch": 2.174019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1667428988558988, + "kl": 0.061865609139204025, + "learning_rate": 2.1569709883618382e-07, + "loss": -0.0109, + "num_tokens": 56077308.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6617133617401123, + "sampling/importance_sampling_ratio/mean": 1.000140905380249, + "sampling/importance_sampling_ratio/min": 0.44791504740715027, + "sampling/sampling_logp_difference/max": 0.8031517267227173, + "sampling/sampling_logp_difference/mean": 0.01590428128838539, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 246.875, + "completions/mean_terminated_length": 246.875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.2676558494567871, + "epoch": 2.1752450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1496004248726603, + "kl": 0.06484948098659515, + "learning_rate": 2.1511137716863687e-07, + "loss": 0.0323, + "num_tokens": 56114980.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.8461192846298218, + "sampling/importance_sampling_ratio/mean": 1.000048279762268, + "sampling/importance_sampling_ratio/min": 0.33948543667793274, + "sampling/sampling_logp_difference/max": 1.0803241729736328, + "sampling/sampling_logp_difference/mean": 0.013982707634568214, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 213.734375, + "completions/mean_terminated_length": 213.734375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.2765084505081177, + "epoch": 2.176470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.227470529989764, + "kl": 0.14539030194282532, + "learning_rate": 2.1452623380694602e-07, + "loss": 0.0079, + "num_tokens": 56146003.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6098580360412598, + "sampling/importance_sampling_ratio/mean": 0.9999431371688843, + "sampling/importance_sampling_ratio/min": 0.4871361553668976, + "sampling/sampling_logp_difference/max": 0.7192115783691406, + "sampling/sampling_logp_difference/mean": 0.014973534271121025, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 245.515625, + "completions/mean_terminated_length": 245.515625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.24445977807044983, + "epoch": 2.1776960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.574898244112307, + "kl": 0.10070991516113281, + "learning_rate": 2.1394166993891526e-07, + "loss": 0.0018, + "num_tokens": 56184308.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995533227920532, + "sampling/importance_sampling_ratio/min": 0.38582128286361694, + "sampling/sampling_logp_difference/max": 2.2083592414855957, + "sampling/sampling_logp_difference/mean": 0.013410156592726707, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 216.984375, + "completions/mean_terminated_length": 216.984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2757517099380493, + "epoch": 2.178921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.272832905951485, + "kl": 0.09829697012901306, + "learning_rate": 2.1335768675117205e-07, + "loss": 0.0277, + "num_tokens": 56217827.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6953853368759155, + "sampling/importance_sampling_ratio/mean": 0.9995964765548706, + "sampling/importance_sampling_ratio/min": 0.3189980685710907, + "sampling/sampling_logp_difference/max": 1.1425702571868896, + "sampling/sampling_logp_difference/mean": 0.014155558310449123, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 221.234375, + "completions/mean_terminated_length": 221.234375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3149792551994324, + "epoch": 2.1801470588235294, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.166820660842093, + "kl": 0.1276342123746872, + "learning_rate": 2.1277428542916555e-07, + "loss": -0.0022, + "num_tokens": 56251330.0, + "reward": 0.03125, + "reward_std": 0.5959457159042358, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7731382846832275, + "sampling/importance_sampling_ratio/mean": 1.0004637241363525, + "sampling/importance_sampling_ratio/min": 0.4084855020046234, + "sampling/sampling_logp_difference/max": 0.8952988386154175, + "sampling/sampling_logp_difference/mean": 0.017157146707177162, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 229.9375, + "completions/mean_terminated_length": 229.9375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2507379651069641, + "epoch": 2.1813725490196076, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7191440763102566, + "kl": 0.06998881697654724, + "learning_rate": 2.121914671571633e-07, + "loss": -0.0332, + "num_tokens": 56280318.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4447228908538818, + "sampling/importance_sampling_ratio/mean": 1.0003588199615479, + "sampling/importance_sampling_ratio/min": 0.5850769281387329, + "sampling/sampling_logp_difference/max": 0.5360119342803955, + "sampling/sampling_logp_difference/mean": 0.013089507818222046, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 159.140625, + "completions/mean_terminated_length": 159.140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.20331966876983643, + "epoch": 2.1825980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047057333445381995, + "kl": 0.07965545356273651, + "learning_rate": 2.1160923311824934e-07, + "loss": 0.0008, + "num_tokens": 56308743.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8327713012695312, + "sampling/importance_sampling_ratio/mean": 0.9995776414871216, + "sampling/importance_sampling_ratio/min": 0.53630131483078, + "sampling/sampling_logp_difference/max": 0.6230590343475342, + "sampling/sampling_logp_difference/mean": 0.012753108516335487, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 170.953125, + "completions/mean_terminated_length": 170.953125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2831561863422394, + "epoch": 2.1838235294117645, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8795245260456575, + "kl": 0.1338336020708084, + "learning_rate": 2.110275844943223e-07, + "loss": -0.0086, + "num_tokens": 56334868.0, + "reward": 0.0, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7142467498779297, + "sampling/importance_sampling_ratio/mean": 1.000351071357727, + "sampling/importance_sampling_ratio/min": 0.6327558159828186, + "sampling/sampling_logp_difference/max": 0.5389738082885742, + "sampling/sampling_logp_difference/mean": 0.015494199469685555, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 143.984375, + "completions/mean_terminated_length": 143.984375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.12787416577339172, + "epoch": 2.185049019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04899348871208142, + "kl": 0.052631497383117676, + "learning_rate": 2.1044652246609173e-07, + "loss": 0.0005, + "num_tokens": 56356595.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002912282943726, + "sampling/importance_sampling_ratio/min": 0.41095978021621704, + "sampling/sampling_logp_difference/max": 0.8892599940299988, + "sampling/sampling_logp_difference/mean": 0.00995919480919838, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 227.453125, + "completions/mean_terminated_length": 227.453125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.24770432710647583, + "epoch": 2.186274509803922, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.850225326570932, + "kl": 0.12453025579452515, + "learning_rate": 2.098660482130768e-07, + "loss": 0.0561, + "num_tokens": 56384320.0, + "reward": 0.6875, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6594297885894775, + "sampling/importance_sampling_ratio/mean": 1.0000152587890625, + "sampling/importance_sampling_ratio/min": 0.5000889301300049, + "sampling/sampling_logp_difference/max": 0.6929693222045898, + "sampling/sampling_logp_difference/mean": 0.013516264036297798, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 153.203125, + "completions/mean_terminated_length": 153.203125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.25445735454559326, + "epoch": 2.1875, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1408699212596036, + "kl": 0.11505749076604843, + "learning_rate": 2.092861629136033e-07, + "loss": -0.0087, + "num_tokens": 56410637.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6286143064498901, + "sampling/importance_sampling_ratio/mean": 0.9998992681503296, + "sampling/importance_sampling_ratio/min": 0.49341726303100586, + "sampling/sampling_logp_difference/max": 0.7064000964164734, + "sampling/sampling_logp_difference/mean": 0.016297511756420135, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 198.578125, + "completions/mean_terminated_length": 198.578125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2938719391822815, + "epoch": 2.188725490196078, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3957152626390525, + "kl": 0.12690046429634094, + "learning_rate": 2.0870686774480196e-07, + "loss": 0.11, + "num_tokens": 56440898.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7653018236160278, + "sampling/importance_sampling_ratio/mean": 1.0002646446228027, + "sampling/importance_sampling_ratio/min": 0.5458997488021851, + "sampling/sampling_logp_difference/max": 0.6053199768066406, + "sampling/sampling_logp_difference/mean": 0.016641918569803238, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 219.84375, + "completions/mean_terminated_length": 219.84375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.27959156036376953, + "epoch": 2.189950980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8963628929077574, + "kl": 0.09853015840053558, + "learning_rate": 2.0812816388260519e-07, + "loss": 0.0119, + "num_tokens": 56475800.0, + "reward": 0.3125, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.654282569885254, + "sampling/importance_sampling_ratio/mean": 1.0007174015045166, + "sampling/importance_sampling_ratio/min": 0.5283967852592468, + "sampling/sampling_logp_difference/max": 0.6379077434539795, + "sampling/sampling_logp_difference/mean": 0.014839932322502136, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 204.8125, + "completions/mean_terminated_length": 204.8125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2499319463968277, + "epoch": 2.1911764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0891285918951858, + "kl": 0.09234219789505005, + "learning_rate": 2.0755005250174484e-07, + "loss": 0.0214, + "num_tokens": 56507372.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.9342883825302124, + "sampling/importance_sampling_ratio/mean": 1.000509262084961, + "sampling/importance_sampling_ratio/min": 0.4982292950153351, + "sampling/sampling_logp_difference/max": 0.6966948509216309, + "sampling/sampling_logp_difference/mean": 0.01440890971571207, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 226.0, + "completions/mean_terminated_length": 226.0, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.24660606682300568, + "epoch": 2.1924019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.839575214882834, + "kl": 0.12259334325790405, + "learning_rate": 2.0697253477575088e-07, + "loss": -0.0009, + "num_tokens": 56534348.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7618639469146729, + "sampling/importance_sampling_ratio/mean": 1.0006442070007324, + "sampling/importance_sampling_ratio/min": 0.6075704097747803, + "sampling/sampling_logp_difference/max": 0.5663723945617676, + "sampling/sampling_logp_difference/mean": 0.014180932193994522, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 230.015625, + "completions/mean_terminated_length": 230.015625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2670382559299469, + "epoch": 2.1936274509803924, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2174885407937754, + "kl": 0.07931780815124512, + "learning_rate": 2.0639561187694733e-07, + "loss": -0.0433, + "num_tokens": 56564813.0, + "reward": 0.1875, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.575170636177063, + "sampling/importance_sampling_ratio/mean": 1.0001091957092285, + "sampling/importance_sampling_ratio/min": 0.6236374378204346, + "sampling/sampling_logp_difference/max": 0.4721860885620117, + "sampling/sampling_logp_difference/mean": 0.012448785826563835, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 202.734375, + "completions/mean_terminated_length": 202.734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2719419598579407, + "epoch": 2.1948529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.4177688377458115, + "kl": 0.1150088682770729, + "learning_rate": 2.0581928497645164e-07, + "loss": -0.0505, + "num_tokens": 56596012.0, + "reward": 0.34375, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.8363903760910034, + "sampling/importance_sampling_ratio/mean": 0.999383270740509, + "sampling/importance_sampling_ratio/min": 0.47761914134025574, + "sampling/sampling_logp_difference/max": 0.7389416694641113, + "sampling/sampling_logp_difference/mean": 0.014551948755979538, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 156.859375, + "completions/mean_terminated_length": 156.859375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24167387187480927, + "epoch": 2.196078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2773096667814843, + "kl": 0.08351308852434158, + "learning_rate": 2.0524355524417015e-07, + "loss": -0.009, + "num_tokens": 56623939.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7385048866271973, + "sampling/importance_sampling_ratio/mean": 1.0004111528396606, + "sampling/importance_sampling_ratio/min": 0.5637833476066589, + "sampling/sampling_logp_difference/max": 0.5730853080749512, + "sampling/sampling_logp_difference/mean": 0.01441868208348751, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 195.546875, + "completions/mean_terminated_length": 195.546875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.30555734038352966, + "epoch": 2.1973039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1106148320527351, + "kl": 0.11144275963306427, + "learning_rate": 2.0466842384879829e-07, + "loss": -0.0079, + "num_tokens": 56653462.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5820528268814087, + "sampling/importance_sampling_ratio/mean": 1.0003950595855713, + "sampling/importance_sampling_ratio/min": 0.5864598155021667, + "sampling/sampling_logp_difference/max": 0.5336510539054871, + "sampling/sampling_logp_difference/mean": 0.015375595539808273, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 231.625, + "completions/mean_terminated_length": 231.625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.24411731958389282, + "epoch": 2.198529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7208697359651974, + "kl": 0.09210469573736191, + "learning_rate": 2.0409389195781623e-07, + "loss": -0.0595, + "num_tokens": 56686430.0, + "reward": -0.1875, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000260353088379, + "sampling/importance_sampling_ratio/min": 0.5911000967025757, + "sampling/sampling_logp_difference/max": 0.8386600017547607, + "sampling/sampling_logp_difference/mean": 0.013181643560528755, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 164.65625, + "completions/mean_terminated_length": 164.65625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.25683221220970154, + "epoch": 2.1997549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5931127315147342, + "kl": 0.11360817402601242, + "learning_rate": 2.0351996073748713e-07, + "loss": -0.0039, + "num_tokens": 56718152.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000403642654419, + "sampling/importance_sampling_ratio/min": 0.5696611404418945, + "sampling/sampling_logp_difference/max": 0.7048375606536865, + "sampling/sampling_logp_difference/mean": 0.015989817678928375, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 242.453125, + "completions/mean_terminated_length": 242.453125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.26049256324768066, + "epoch": 2.200980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.888650218870308, + "kl": 0.08447247743606567, + "learning_rate": 2.0294663135285533e-07, + "loss": -0.0133, + "num_tokens": 56752853.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.8217499256134033, + "sampling/importance_sampling_ratio/mean": 1.0003026723861694, + "sampling/importance_sampling_ratio/min": 0.6241431832313538, + "sampling/sampling_logp_difference/max": 0.5997974872589111, + "sampling/sampling_logp_difference/mean": 0.013664944097399712, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 182.078125, + "completions/mean_terminated_length": 182.078125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2810284495353699, + "epoch": 2.202205882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9051843705444376, + "kl": 0.12366537749767303, + "learning_rate": 2.0237390496774282e-07, + "loss": 0.0259, + "num_tokens": 56778938.0, + "reward": 0.34375, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9871368408203125, + "sampling/importance_sampling_ratio/mean": 1.0005348920822144, + "sampling/importance_sampling_ratio/min": 0.49550315737724304, + "sampling/sampling_logp_difference/max": 0.7021815776824951, + "sampling/sampling_logp_difference/mean": 0.016560807824134827, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 184.0625, + "completions/mean_terminated_length": 184.0625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.22490039467811584, + "epoch": 2.2034313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056413215260071876, + "kl": 0.07760661840438843, + "learning_rate": 2.0180178274474834e-07, + "loss": 0.0008, + "num_tokens": 56811998.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.900390386581421, + "sampling/importance_sampling_ratio/mean": 0.999333918094635, + "sampling/importance_sampling_ratio/min": 0.5382927656173706, + "sampling/sampling_logp_difference/max": 0.642059326171875, + "sampling/sampling_logp_difference/mean": 0.013159316033124924, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 158.15625, + "completions/mean_terminated_length": 158.15625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.24703152477741241, + "epoch": 2.204656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5164389722404052, + "kl": 0.07044568657875061, + "learning_rate": 2.012302658452432e-07, + "loss": 0.021, + "num_tokens": 56837528.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.636584758758545, + "sampling/importance_sampling_ratio/mean": 0.9996676445007324, + "sampling/importance_sampling_ratio/min": 0.4209079146385193, + "sampling/sampling_logp_difference/max": 0.8653411865234375, + "sampling/sampling_logp_difference/mean": 0.013793625868856907, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 175.0625, + "completions/mean_terminated_length": 175.0625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.26092529296875, + "epoch": 2.2058823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0319771400909987, + "kl": 0.12363815307617188, + "learning_rate": 2.0065935542937073e-07, + "loss": -0.0257, + "num_tokens": 56865660.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.58610200881958, + "sampling/importance_sampling_ratio/mean": 1.0004420280456543, + "sampling/importance_sampling_ratio/min": 0.5681842565536499, + "sampling/sampling_logp_difference/max": 0.5653095245361328, + "sampling/sampling_logp_difference/mean": 0.014487011358141899, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 222.96875, + "completions/mean_terminated_length": 222.96875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.23370033502578735, + "epoch": 2.207107843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05426284143562628, + "kl": 0.08036220073699951, + "learning_rate": 2.0008905265604315e-07, + "loss": 0.0007, + "num_tokens": 56899162.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.767967700958252, + "sampling/importance_sampling_ratio/mean": 1.0001204013824463, + "sampling/importance_sampling_ratio/min": 0.2905968427658081, + "sampling/sampling_logp_difference/max": 1.2358183860778809, + "sampling/sampling_logp_difference/mean": 0.013781680725514889, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 167.640625, + "completions/mean_terminated_length": 167.640625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.21038201451301575, + "epoch": 2.2083333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7168966777548178, + "kl": 0.08649880439043045, + "learning_rate": 1.995193586829387e-07, + "loss": -0.0401, + "num_tokens": 56923827.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6357125043869019, + "sampling/importance_sampling_ratio/mean": 1.0007596015930176, + "sampling/importance_sampling_ratio/min": 0.6269845962524414, + "sampling/sampling_logp_difference/max": 0.4920785427093506, + "sampling/sampling_logp_difference/mean": 0.012823158875107765, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 204.765625, + "completions/mean_terminated_length": 204.765625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3014012575149536, + "epoch": 2.2095588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7992363173722363, + "kl": 0.1286182552576065, + "learning_rate": 1.989502746665001e-07, + "loss": -0.0384, + "num_tokens": 56951876.0, + "reward": -0.28125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.4636856317520142, + "sampling/importance_sampling_ratio/mean": 0.9995325207710266, + "sampling/importance_sampling_ratio/min": 0.6095805764198303, + "sampling/sampling_logp_difference/max": 0.49498414993286133, + "sampling/sampling_logp_difference/mean": 0.014365926384925842, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 198.453125, + "completions/mean_terminated_length": 198.453125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.21758437156677246, + "epoch": 2.2107843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.325185412232773, + "kl": 0.08413703739643097, + "learning_rate": 1.9838180176193176e-07, + "loss": 0.0533, + "num_tokens": 56992097.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.690238118171692, + "sampling/importance_sampling_ratio/mean": 1.0002226829528809, + "sampling/importance_sampling_ratio/min": 0.29751691222190857, + "sampling/sampling_logp_difference/max": 1.2122842073440552, + "sampling/sampling_logp_difference/mean": 0.013783496804535389, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 175.25, + "completions/mean_terminated_length": 175.25, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.19069215655326843, + "epoch": 2.2120098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5874936688245536, + "kl": 0.06194804608821869, + "learning_rate": 1.9781394112319787e-07, + "loss": -0.1226, + "num_tokens": 57017713.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6393591165542603, + "sampling/importance_sampling_ratio/mean": 0.999832272529602, + "sampling/importance_sampling_ratio/min": 0.6047071814537048, + "sampling/sampling_logp_difference/max": 0.5030109882354736, + "sampling/sampling_logp_difference/mean": 0.012021776288747787, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 212.65625, + "completions/mean_terminated_length": 212.65625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.36637696623802185, + "epoch": 2.213235294117647, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0582492288032177, + "kl": 0.13019004464149475, + "learning_rate": 1.9724669390301946e-07, + "loss": 0.037, + "num_tokens": 57053819.0, + "reward": 0.46875, + "reward_std": 0.8987700343132019, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5888125896453857, + "sampling/importance_sampling_ratio/mean": 0.9998255372047424, + "sampling/importance_sampling_ratio/min": 0.5999937057495117, + "sampling/sampling_logp_difference/max": 0.510836124420166, + "sampling/sampling_logp_difference/mean": 0.01723702810704708, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 165.5625, + "completions/mean_terminated_length": 165.5625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2223309427499771, + "epoch": 2.2144607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5234672161510483, + "kl": 0.10448633879423141, + "learning_rate": 1.9668006125287228e-07, + "loss": 0.0074, + "num_tokens": 57079439.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.608222246170044, + "sampling/importance_sampling_ratio/mean": 0.9996899366378784, + "sampling/importance_sampling_ratio/min": 0.5366796851158142, + "sampling/sampling_logp_difference/max": 0.6223537921905518, + "sampling/sampling_logp_difference/mean": 0.014071298763155937, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 266.59375, + "completions/mean_terminated_length": 266.59375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2737465500831604, + "epoch": 2.215686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5541124850224726, + "kl": 0.06044057011604309, + "learning_rate": 1.96114044322985e-07, + "loss": 0.1126, + "num_tokens": 57111269.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.695718765258789, + "sampling/importance_sampling_ratio/mean": 0.9998077154159546, + "sampling/importance_sampling_ratio/min": 0.537212073802948, + "sampling/sampling_logp_difference/max": 0.6213623285293579, + "sampling/sampling_logp_difference/mean": 0.01348542608320713, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 179.71875, + "completions/mean_terminated_length": 179.71875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.21518371999263763, + "epoch": 2.2169117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3877720786346066, + "kl": 0.06830859929323196, + "learning_rate": 1.9554864426233604e-07, + "loss": 0.0498, + "num_tokens": 57136307.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6554591655731201, + "sampling/importance_sampling_ratio/mean": 0.999778151512146, + "sampling/importance_sampling_ratio/min": 0.26966822147369385, + "sampling/sampling_logp_difference/max": 1.3105628490447998, + "sampling/sampling_logp_difference/mean": 0.012849608436226845, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 153.09375, + "completions/mean_terminated_length": 153.09375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.23742112517356873, + "epoch": 2.218137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.4757763573259473, + "kl": 0.1102885976433754, + "learning_rate": 1.9498386221865165e-07, + "loss": 0.0117, + "num_tokens": 57159529.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5744246244430542, + "sampling/importance_sampling_ratio/mean": 1.000669240951538, + "sampling/importance_sampling_ratio/min": 0.605816125869751, + "sampling/sampling_logp_difference/max": 0.5011787414550781, + "sampling/sampling_logp_difference/mean": 0.014759579673409462, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 220.546875, + "completions/mean_terminated_length": 220.546875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.27240610122680664, + "epoch": 2.219362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06490112940719692, + "kl": 0.12128046154975891, + "learning_rate": 1.944196993384034e-07, + "loss": 0.0011, + "num_tokens": 57196924.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6916193962097168, + "sampling/importance_sampling_ratio/mean": 1.000105857849121, + "sampling/importance_sampling_ratio/min": 0.6148074865341187, + "sampling/sampling_logp_difference/max": 0.5256862640380859, + "sampling/sampling_logp_difference/mean": 0.017099231481552124, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 162.5625, + "completions/mean_terminated_length": 162.5625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.29277801513671875, + "epoch": 2.2205882352941178, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.493362889973553, + "kl": 0.13007113337516785, + "learning_rate": 1.9385615676680661e-07, + "loss": -0.0134, + "num_tokens": 57223968.0, + "reward": 0.5625, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4952139854431152, + "sampling/importance_sampling_ratio/mean": 1.0002390146255493, + "sampling/importance_sampling_ratio/min": 0.4914461374282837, + "sampling/sampling_logp_difference/max": 0.7104029655456543, + "sampling/sampling_logp_difference/mean": 0.015847956761717796, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 208.328125, + "completions/mean_terminated_length": 208.328125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.24020692706108093, + "epoch": 2.221813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07127336127746393, + "kl": 0.12844060361385345, + "learning_rate": 1.932932356478168e-07, + "loss": 0.001, + "num_tokens": 57253189.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5585031509399414, + "sampling/importance_sampling_ratio/mean": 1.0003728866577148, + "sampling/importance_sampling_ratio/min": 0.33361685276031494, + "sampling/sampling_logp_difference/max": 1.097762107849121, + "sampling/sampling_logp_difference/mean": 0.012762902304530144, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 173.203125, + "completions/mean_terminated_length": 173.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2723970413208008, + "epoch": 2.2230392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1579952625133267, + "kl": 0.1221776232123375, + "learning_rate": 1.9273093712412796e-07, + "loss": 0.0255, + "num_tokens": 57281970.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.7064048051834106, + "sampling/importance_sampling_ratio/mean": 0.9998112916946411, + "sampling/importance_sampling_ratio/min": 0.22986814379692078, + "sampling/sampling_logp_difference/max": 1.4702494144439697, + "sampling/sampling_logp_difference/mean": 0.0177859365940094, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 185.4375, + "completions/mean_terminated_length": 185.4375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.23902729153633118, + "epoch": 2.224264705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.202025891469844, + "kl": 0.09321679174900055, + "learning_rate": 1.9216926233717084e-07, + "loss": 0.0081, + "num_tokens": 57313502.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999767541885376, + "sampling/importance_sampling_ratio/min": 0.3941425383090973, + "sampling/sampling_logp_difference/max": 0.9310426712036133, + "sampling/sampling_logp_difference/mean": 0.013515518978238106, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2674904465675354, + "epoch": 2.2254901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.050382867341168, + "kl": 0.09484975039958954, + "learning_rate": 1.9160821242710957e-07, + "loss": -0.0138, + "num_tokens": 57342350.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6207305192947388, + "sampling/importance_sampling_ratio/mean": 1.0005056858062744, + "sampling/importance_sampling_ratio/min": 0.6129491329193115, + "sampling/sampling_logp_difference/max": 0.4894733428955078, + "sampling/sampling_logp_difference/mean": 0.013506803661584854, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 227.984375, + "completions/mean_terminated_length": 227.984375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.24355986714363098, + "epoch": 2.2267156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2725728688716063, + "kl": 0.08003943413496017, + "learning_rate": 1.9104778853283987e-07, + "loss": 0.0237, + "num_tokens": 57374413.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.8314207792282104, + "sampling/importance_sampling_ratio/mean": 1.0002812147140503, + "sampling/importance_sampling_ratio/min": 0.6203051209449768, + "sampling/sampling_logp_difference/max": 0.6050920486450195, + "sampling/sampling_logp_difference/mean": 0.013915604911744595, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 189.109375, + "completions/mean_terminated_length": 189.109375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3851677179336548, + "epoch": 2.2279411764705883, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4054685081200353, + "kl": 0.19514328241348267, + "learning_rate": 1.9048799179198655e-07, + "loss": -0.0561, + "num_tokens": 57400340.0, + "reward": -0.25, + "reward_std": 0.6972135901451111, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5971453189849854, + "sampling/importance_sampling_ratio/mean": 0.9996978640556335, + "sampling/importance_sampling_ratio/min": 0.4955294728279114, + "sampling/sampling_logp_difference/max": 0.7021284103393555, + "sampling/sampling_logp_difference/mean": 0.018159667029976845, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 185.953125, + "completions/mean_terminated_length": 185.953125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.1858978122472763, + "epoch": 2.2291666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7616215753387285, + "kl": 0.07825511693954468, + "learning_rate": 1.8992882334090188e-07, + "loss": 0.0884, + "num_tokens": 57426929.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000319242477417, + "sampling/importance_sampling_ratio/min": 0.12653733789920807, + "sampling/sampling_logp_difference/max": 2.0672178268432617, + "sampling/sampling_logp_difference/mean": 0.012095373123884201, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 189.46875, + "completions/mean_terminated_length": 189.46875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2210862636566162, + "epoch": 2.230392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0660557656364722, + "kl": 0.06625708937644958, + "learning_rate": 1.893702843146623e-07, + "loss": 0.0354, + "num_tokens": 57459279.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5971460342407227, + "sampling/importance_sampling_ratio/mean": 0.9999723434448242, + "sampling/importance_sampling_ratio/min": 0.48653677105903625, + "sampling/sampling_logp_difference/max": 0.7204427719116211, + "sampling/sampling_logp_difference/mean": 0.012377100065350533, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 183.109375, + "completions/mean_terminated_length": 183.109375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.2027810513973236, + "epoch": 2.2316176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09053463841587767, + "kl": 0.06789502501487732, + "learning_rate": 1.8881237584706632e-07, + "loss": 0.0006, + "num_tokens": 57487414.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.627347469329834, + "sampling/importance_sampling_ratio/mean": 0.9995033740997314, + "sampling/importance_sampling_ratio/min": 0.4853755533695221, + "sampling/sampling_logp_difference/max": 0.7228323221206665, + "sampling/sampling_logp_difference/mean": 0.01381603628396988, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 197.90625, + "completions/mean_terminated_length": 197.90625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.2434486746788025, + "epoch": 2.232843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.344361640714432, + "kl": 0.07310117781162262, + "learning_rate": 1.8825509907063326e-07, + "loss": -0.0194, + "num_tokens": 57515728.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5562864542007446, + "sampling/importance_sampling_ratio/mean": 0.999854564666748, + "sampling/importance_sampling_ratio/min": 0.5838513374328613, + "sampling/sampling_logp_difference/max": 0.5381088256835938, + "sampling/sampling_logp_difference/mean": 0.013278448022902012, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 5000.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 301.078125, + "completions/mean_terminated_length": 226.49208068847656, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2564546465873718, + "epoch": 2.2340686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7483155096265234, + "kl": 0.08983118087053299, + "learning_rate": 1.8769845511659927e-07, + "loss": 0.4925, + "num_tokens": 57553253.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.5528501868247986, + "sampling/sampling_logp_difference/max": 0.79681396484375, + "sampling/sampling_logp_difference/mean": 0.013430179096758366, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 174.96875, + "completions/mean_terminated_length": 174.96875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2382800132036209, + "epoch": 2.235294117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1293663149804734, + "kl": 0.10047850757837296, + "learning_rate": 1.871424451149169e-07, + "loss": 0.0514, + "num_tokens": 57582243.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.62760591506958, + "sampling/importance_sampling_ratio/mean": 0.9997063875198364, + "sampling/importance_sampling_ratio/min": 0.5972736477851868, + "sampling/sampling_logp_difference/max": 0.5153799057006836, + "sampling/sampling_logp_difference/mean": 0.014239577576518059, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 235.890625, + "completions/mean_terminated_length": 235.890625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.30149415135383606, + "epoch": 2.236519607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9845678835254071, + "kl": 0.15238609910011292, + "learning_rate": 1.865870701942504e-07, + "loss": -0.0024, + "num_tokens": 57620396.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6185318231582642, + "sampling/importance_sampling_ratio/mean": 1.0003349781036377, + "sampling/importance_sampling_ratio/min": 0.420204222202301, + "sampling/sampling_logp_difference/max": 0.8670144081115723, + "sampling/sampling_logp_difference/mean": 0.01578395627439022, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 203.4375, + "completions/mean_terminated_length": 203.4375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2554306387901306, + "epoch": 2.2377450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07020997658094677, + "kl": 0.11225457489490509, + "learning_rate": 1.8603233148197632e-07, + "loss": 0.0011, + "num_tokens": 57652008.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5684692859649658, + "sampling/importance_sampling_ratio/mean": 0.9999103546142578, + "sampling/importance_sampling_ratio/min": 0.6260828971862793, + "sampling/sampling_logp_difference/max": 0.46827244758605957, + "sampling/sampling_logp_difference/mean": 0.01270446926355362, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 213.203125, + "completions/mean_terminated_length": 213.203125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2989576458930969, + "epoch": 2.238970588235294, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.816139110366677, + "kl": 0.1348409354686737, + "learning_rate": 1.8547823010417873e-07, + "loss": -0.0742, + "num_tokens": 57680149.0, + "reward": 0.40625, + "reward_std": 0.6205305457115173, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6038249731063843, + "sampling/importance_sampling_ratio/mean": 1.0002083778381348, + "sampling/importance_sampling_ratio/min": 0.577558159828186, + "sampling/sampling_logp_difference/max": 0.5489461421966553, + "sampling/sampling_logp_difference/mean": 0.014283552765846252, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 222.828125, + "completions/mean_terminated_length": 222.828125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2774503529071808, + "epoch": 2.2401960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0941952484861721, + "kl": 0.10034649819135666, + "learning_rate": 1.8492476718564866e-07, + "loss": 0.0006, + "num_tokens": 57713482.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5491840839385986, + "sampling/importance_sampling_ratio/mean": 1.0000077486038208, + "sampling/importance_sampling_ratio/min": 0.4728690981864929, + "sampling/sampling_logp_difference/max": 0.748936653137207, + "sampling/sampling_logp_difference/mean": 0.014115766622126102, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 246.484375, + "completions/mean_terminated_length": 246.484375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2916295528411865, + "epoch": 2.241421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06366216926669424, + "kl": 0.12240947037935257, + "learning_rate": 1.8437194384988058e-07, + "loss": 0.0011, + "num_tokens": 57747209.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999750018119812, + "sampling/importance_sampling_ratio/min": 0.32487982511520386, + "sampling/sampling_logp_difference/max": 1.1242998838424683, + "sampling/sampling_logp_difference/mean": 0.015239045023918152, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 157.109375, + "completions/mean_terminated_length": 157.109375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.21737360954284668, + "epoch": 2.2426470588235294, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5352442426278357, + "kl": 0.08680347353219986, + "learning_rate": 1.8381976121907067e-07, + "loss": 0.0733, + "num_tokens": 57772976.0, + "reward": 0.46875, + "reward_std": 0.5281128883361816, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4428476095199585, + "sampling/importance_sampling_ratio/mean": 0.9994739294052124, + "sampling/importance_sampling_ratio/min": 0.5038317441940308, + "sampling/sampling_logp_difference/max": 0.6855130195617676, + "sampling/sampling_logp_difference/mean": 0.012364721857011318, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 239.1875, + "completions/mean_terminated_length": 239.1875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.18108172714710236, + "epoch": 2.2438725490196076, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.578981437829289, + "kl": 0.07498110085725784, + "learning_rate": 1.832682204141152e-07, + "loss": 0.0199, + "num_tokens": 57806796.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7198060750961304, + "sampling/importance_sampling_ratio/mean": 1.0001530647277832, + "sampling/importance_sampling_ratio/min": 0.5394806265830994, + "sampling/sampling_logp_difference/max": 0.6171483993530273, + "sampling/sampling_logp_difference/mean": 0.011187737807631493, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 155.796875, + "completions/mean_terminated_length": 155.796875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2116914689540863, + "epoch": 2.2450980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06012952864614764, + "kl": 0.09823980182409286, + "learning_rate": 1.8271732255460643e-07, + "loss": 0.001, + "num_tokens": 57834287.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8599026203155518, + "sampling/importance_sampling_ratio/mean": 0.9994688034057617, + "sampling/importance_sampling_ratio/min": 0.6174707412719727, + "sampling/sampling_logp_difference/max": 0.6205241680145264, + "sampling/sampling_logp_difference/mean": 0.012800629250705242, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 261.015625, + "completions/mean_terminated_length": 261.015625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.27193963527679443, + "epoch": 2.2463235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5690034865510771, + "kl": 0.05566692352294922, + "learning_rate": 1.8216706875883252e-07, + "loss": -0.0431, + "num_tokens": 57868048.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.595842719078064, + "sampling/importance_sampling_ratio/mean": 0.9992786645889282, + "sampling/importance_sampling_ratio/min": 0.6169748306274414, + "sampling/sampling_logp_difference/max": 0.4829270839691162, + "sampling/sampling_logp_difference/mean": 0.014392347075045109, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 193.40625, + "completions/mean_terminated_length": 193.40625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.21245065331459045, + "epoch": 2.247549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.239601729534302, + "kl": 0.06684806942939758, + "learning_rate": 1.816174601437736e-07, + "loss": -0.0558, + "num_tokens": 57899866.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7529635429382324, + "sampling/importance_sampling_ratio/mean": 1.0001167058944702, + "sampling/importance_sampling_ratio/min": 0.54033362865448, + "sampling/sampling_logp_difference/max": 0.6155685782432556, + "sampling/sampling_logp_difference/mean": 0.012661349959671497, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 248.296875, + "completions/mean_terminated_length": 248.296875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3864680528640747, + "epoch": 2.248774509803922, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7114122116645036, + "kl": 0.14726883172988892, + "learning_rate": 1.8106849782510058e-07, + "loss": -0.0014, + "num_tokens": 57933629.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4798548221588135, + "sampling/importance_sampling_ratio/mean": 0.9994147419929504, + "sampling/importance_sampling_ratio/min": 0.47067582607269287, + "sampling/sampling_logp_difference/max": 0.7535857558250427, + "sampling/sampling_logp_difference/mean": 0.018671337515115738, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 192.578125, + "completions/mean_terminated_length": 192.578125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3123793601989746, + "epoch": 2.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05289987181211492, + "kl": 0.09484916180372238, + "learning_rate": 1.8052018291717215e-07, + "loss": 0.001, + "num_tokens": 57969298.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8810420036315918, + "sampling/importance_sampling_ratio/mean": 1.0005704164505005, + "sampling/importance_sampling_ratio/min": 0.5695523619651794, + "sampling/sampling_logp_difference/max": 0.6318259239196777, + "sampling/sampling_logp_difference/mean": 0.017064901068806648, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2550053596496582, + "epoch": 2.251225490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03590646972688233, + "kl": 0.06980462372303009, + "learning_rate": 1.7997251653303247e-07, + "loss": 0.0007, + "num_tokens": 58004690.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002665519714355, + "sampling/importance_sampling_ratio/min": 0.48348575830459595, + "sampling/sampling_logp_difference/max": 1.2188178300857544, + "sampling/sampling_logp_difference/mean": 0.013812856748700142, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 202.46875, + "completions/mean_terminated_length": 202.46875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.23180857300758362, + "epoch": 2.252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04306650140612095, + "kl": 0.11366766691207886, + "learning_rate": 1.7942549978441012e-07, + "loss": 0.0011, + "num_tokens": 58036768.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8106759786605835, + "sampling/importance_sampling_ratio/mean": 1.0001732110977173, + "sampling/importance_sampling_ratio/min": 0.32732874155044556, + "sampling/sampling_logp_difference/max": 1.1167902946472168, + "sampling/sampling_logp_difference/mean": 0.013028960675001144, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 210.15625, + "completions/mean_terminated_length": 210.15625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2635669708251953, + "epoch": 2.2536764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4666512103923888, + "kl": 0.09035992622375488, + "learning_rate": 1.7887913378171422e-07, + "loss": 0.0518, + "num_tokens": 58065034.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.7100659608840942, + "sampling/importance_sampling_ratio/mean": 1.000524878501892, + "sampling/importance_sampling_ratio/min": 0.5668810606002808, + "sampling/sampling_logp_difference/max": 0.56760573387146, + "sampling/sampling_logp_difference/mean": 0.01479196548461914, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 236.8125, + "completions/mean_terminated_length": 236.8125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.22833718359470367, + "epoch": 2.2549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2875345140570835, + "kl": 0.07793845236301422, + "learning_rate": 1.783334196340331e-07, + "loss": -0.0267, + "num_tokens": 58098766.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996142983436584, + "sampling/importance_sampling_ratio/min": 0.5325475335121155, + "sampling/sampling_logp_difference/max": 0.8491120338439941, + "sampling/sampling_logp_difference/mean": 0.012790380977094173, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 158.609375, + "completions/mean_terminated_length": 158.609375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.19125020503997803, + "epoch": 2.256127450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06275535771901054, + "kl": 0.07269306480884552, + "learning_rate": 1.777883584491317e-07, + "loss": 0.0007, + "num_tokens": 58122853.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.574523687362671, + "sampling/importance_sampling_ratio/mean": 0.9985629320144653, + "sampling/importance_sampling_ratio/min": 0.3798008859157562, + "sampling/sampling_logp_difference/max": 0.9681081771850586, + "sampling/sampling_logp_difference/mean": 0.012079785577952862, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 170.875, + "completions/mean_terminated_length": 170.875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.21966275572776794, + "epoch": 2.2573529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2274985334583557, + "kl": 0.122444286942482, + "learning_rate": 1.7724395133345022e-07, + "loss": -0.0055, + "num_tokens": 58154525.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 1.0001604557037354, + "sampling/importance_sampling_ratio/min": 0.5928127765655518, + "sampling/sampling_logp_difference/max": 0.5228766202926636, + "sampling/sampling_logp_difference/mean": 0.012999025173485279, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 145.28125, + "completions/mean_terminated_length": 145.28125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2318025827407837, + "epoch": 2.258578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057211100450288946, + "kl": 0.13506409525871277, + "learning_rate": 1.7670019939210023e-07, + "loss": 0.0013, + "num_tokens": 58179919.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9647648334503174, + "sampling/importance_sampling_ratio/mean": 1.0004273653030396, + "sampling/importance_sampling_ratio/min": 0.5388311743736267, + "sampling/sampling_logp_difference/max": 0.6753726005554199, + "sampling/sampling_logp_difference/mean": 0.013093828223645687, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.22805750370025635, + "epoch": 2.2598039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7208807686495529, + "kl": 0.1852080225944519, + "learning_rate": 1.761571037288637e-07, + "loss": -0.0331, + "num_tokens": 58206191.0, + "reward": -0.21875, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000075101852417, + "sampling/importance_sampling_ratio/min": 0.18973375856876373, + "sampling/sampling_logp_difference/max": 1.6621334552764893, + "sampling/sampling_logp_difference/mean": 0.014221054501831532, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 223.0625, + "completions/mean_terminated_length": 223.0625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.318605899810791, + "epoch": 2.261029411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9910644140403233, + "kl": 0.12354724109172821, + "learning_rate": 1.7561466544619076e-07, + "loss": -0.0009, + "num_tokens": 58244115.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.8127728700637817, + "sampling/importance_sampling_ratio/mean": 1.0001894235610962, + "sampling/importance_sampling_ratio/min": 0.29515746235847473, + "sampling/sampling_logp_difference/max": 1.2202463150024414, + "sampling/sampling_logp_difference/mean": 0.01690426468849182, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 161.515625, + "completions/mean_terminated_length": 161.515625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.16774044930934906, + "epoch": 2.2622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1828802332373084, + "kl": 0.06102241948246956, + "learning_rate": 1.7507288564519646e-07, + "loss": 0.0006, + "num_tokens": 58268484.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7375820875167847, + "sampling/importance_sampling_ratio/mean": 0.9996050596237183, + "sampling/importance_sampling_ratio/min": 0.509118914604187, + "sampling/sampling_logp_difference/max": 0.6750736236572266, + "sampling/sampling_logp_difference/mean": 0.01106603629887104, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 244.546875, + "completions/mean_terminated_length": 244.546875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.305085688829422, + "epoch": 2.263480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0515456690053895, + "kl": 0.10104508697986603, + "learning_rate": 1.7453176542565956e-07, + "loss": 0.0106, + "num_tokens": 58308855.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7150546312332153, + "sampling/importance_sampling_ratio/mean": 1.000779151916504, + "sampling/importance_sampling_ratio/min": 0.6202815771102905, + "sampling/sampling_logp_difference/max": 0.5394449234008789, + "sampling/sampling_logp_difference/mean": 0.014966677874326706, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 226.921875, + "completions/mean_terminated_length": 226.921875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.27462872862815857, + "epoch": 2.264705882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.323226474562783, + "kl": 0.0892459824681282, + "learning_rate": 1.7399130588601968e-07, + "loss": -0.0757, + "num_tokens": 58348738.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999624490737915, + "sampling/importance_sampling_ratio/min": 0.43417733907699585, + "sampling/sampling_logp_difference/max": 0.8811070919036865, + "sampling/sampling_logp_difference/mean": 0.015549260191619396, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 243.796875, + "completions/mean_terminated_length": 243.796875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.21817326545715332, + "epoch": 2.2659313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.179961487129024, + "kl": 0.07068933546543121, + "learning_rate": 1.7345150812337562e-07, + "loss": 0.0101, + "num_tokens": 58382693.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4933048486709595, + "sampling/importance_sampling_ratio/mean": 1.0000038146972656, + "sampling/importance_sampling_ratio/min": 0.6117191910743713, + "sampling/sampling_logp_difference/max": 0.4914819598197937, + "sampling/sampling_logp_difference/mean": 0.012668643146753311, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 333.515625, + "completions/mean_terminated_length": 333.515625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.3151954710483551, + "epoch": 2.267156862745098, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5045878073237393, + "kl": 0.12498971819877625, + "learning_rate": 1.7291237323348284e-07, + "loss": 0.0304, + "num_tokens": 58420150.0, + "reward": 0.3125, + "reward_std": 0.843070387840271, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.709483027458191, + "sampling/importance_sampling_ratio/mean": 1.000072717666626, + "sampling/importance_sampling_ratio/min": 0.2756160795688629, + "sampling/sampling_logp_difference/max": 1.2887464761734009, + "sampling/sampling_logp_difference/mean": 0.014403359033167362, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 202.65625, + "completions/mean_terminated_length": 202.65625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.22938711941242218, + "epoch": 2.2683823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12281090556332791, + "kl": 0.06494831293821335, + "learning_rate": 1.7237390231075055e-07, + "loss": 0.0006, + "num_tokens": 58456352.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004684925079346, + "sampling/importance_sampling_ratio/min": 0.5263667106628418, + "sampling/sampling_logp_difference/max": 0.751194953918457, + "sampling/sampling_logp_difference/mean": 0.015168210491538048, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 204.734375, + "completions/mean_terminated_length": 204.734375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.26925528049468994, + "epoch": 2.269607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.491373261845078, + "kl": 0.08605523407459259, + "learning_rate": 1.7183609644824092e-07, + "loss": 0.0395, + "num_tokens": 58488335.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.5362949371337891, + "sampling/sampling_logp_difference/max": 0.8254508972167969, + "sampling/sampling_logp_difference/mean": 0.015716159716248512, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 259.296875, + "completions/mean_terminated_length": 259.296875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3036905825138092, + "epoch": 2.2708333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.086729544282273, + "kl": 0.14409559965133667, + "learning_rate": 1.7129895673766575e-07, + "loss": -0.0287, + "num_tokens": 58521794.0, + "reward": 0.125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.8718515634536743, + "sampling/importance_sampling_ratio/mean": 0.9997403621673584, + "sampling/importance_sampling_ratio/min": 0.5336452722549438, + "sampling/sampling_logp_difference/max": 0.6280239820480347, + "sampling/sampling_logp_difference/mean": 0.014530260115861893, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 219.9375, + "completions/mean_terminated_length": 219.9375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.20145106315612793, + "epoch": 2.2720588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04183860565817975, + "kl": 0.06628386676311493, + "learning_rate": 1.707624842693844e-07, + "loss": 0.0006, + "num_tokens": 58561774.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8742541074752808, + "sampling/importance_sampling_ratio/mean": 0.9997987151145935, + "sampling/importance_sampling_ratio/min": 0.47087979316711426, + "sampling/sampling_logp_difference/max": 0.7531524300575256, + "sampling/sampling_logp_difference/mean": 0.012259380891919136, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.26910310983657837, + "epoch": 2.2732843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1433446537160958, + "kl": 0.12811408936977386, + "learning_rate": 1.7022668013240227e-07, + "loss": 0.053, + "num_tokens": 58593390.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4334418773651123, + "sampling/importance_sampling_ratio/mean": 0.9996029138565063, + "sampling/importance_sampling_ratio/min": 0.6056228876113892, + "sampling/sampling_logp_difference/max": 0.501497745513916, + "sampling/sampling_logp_difference/mean": 0.013298267498612404, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 171.453125, + "completions/mean_terminated_length": 171.453125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2976323068141937, + "epoch": 2.2745098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046461394156045846, + "kl": 0.08296193182468414, + "learning_rate": 1.696915454143676e-07, + "loss": 0.0009, + "num_tokens": 58620891.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9882467985153198, + "sampling/importance_sampling_ratio/mean": 1.000699520111084, + "sampling/importance_sampling_ratio/min": 0.6074444055557251, + "sampling/sampling_logp_difference/max": 0.6872532367706299, + "sampling/sampling_logp_difference/mean": 0.015259575098752975, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 216.765625, + "completions/mean_terminated_length": 216.765625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2673759460449219, + "epoch": 2.275735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1585872761982534, + "kl": 0.09247661381959915, + "learning_rate": 1.691570812015704e-07, + "loss": -0.0115, + "num_tokens": 58653580.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6603155136108398, + "sampling/importance_sampling_ratio/mean": 1.0005006790161133, + "sampling/importance_sampling_ratio/min": 0.6128051280975342, + "sampling/sampling_logp_difference/max": 0.5070075988769531, + "sampling/sampling_logp_difference/mean": 0.014402111992239952, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 176.5, + "completions/mean_terminated_length": 176.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.23475946485996246, + "epoch": 2.2769607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2278866171460356, + "kl": 0.07313594222068787, + "learning_rate": 1.6862328857893855e-07, + "loss": -0.0231, + "num_tokens": 58681596.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7771425247192383, + "sampling/importance_sampling_ratio/mean": 1.0007569789886475, + "sampling/importance_sampling_ratio/min": 0.5250598788261414, + "sampling/sampling_logp_difference/max": 0.6442430019378662, + "sampling/sampling_logp_difference/mean": 0.014543603174388409, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 232.15625, + "completions/mean_terminated_length": 232.15625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.30302685499191284, + "epoch": 2.278186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2608571469594911, + "kl": 0.10338533669710159, + "learning_rate": 1.680901686300376e-07, + "loss": -0.0276, + "num_tokens": 58718358.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5953242778778076, + "sampling/importance_sampling_ratio/mean": 0.9996898770332336, + "sampling/importance_sampling_ratio/min": 0.512139618396759, + "sampling/sampling_logp_difference/max": 0.6691579818725586, + "sampling/sampling_logp_difference/mean": 0.016415957361459732, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 208.25, + "completions/mean_terminated_length": 208.25, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.29900407791137695, + "epoch": 2.2794117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7167735694978379, + "kl": 0.0868486762046814, + "learning_rate": 1.6755772243706712e-07, + "loss": 0.0286, + "num_tokens": 58749542.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8058955669403076, + "sampling/importance_sampling_ratio/mean": 1.0002375841140747, + "sampling/importance_sampling_ratio/min": 0.48686483502388, + "sampling/sampling_logp_difference/max": 0.719768762588501, + "sampling/sampling_logp_difference/mean": 0.015454989857971668, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 212.6875, + "completions/mean_terminated_length": 212.6875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2853885889053345, + "epoch": 2.280637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1323403182520444, + "kl": 0.11518781632184982, + "learning_rate": 1.6702595108085942e-07, + "loss": 0.0141, + "num_tokens": 58784290.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004379749298096, + "sampling/importance_sampling_ratio/min": 0.42395901679992676, + "sampling/sampling_logp_difference/max": 4.34964656829834, + "sampling/sampling_logp_difference/mean": 0.01463394146412611, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 228.96875, + "completions/mean_terminated_length": 228.96875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.31048333644866943, + "epoch": 2.281862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.115790642059336, + "kl": 0.08031786978244781, + "learning_rate": 1.6649485564087644e-07, + "loss": 0.0033, + "num_tokens": 58819760.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6257354021072388, + "sampling/importance_sampling_ratio/mean": 0.999483048915863, + "sampling/importance_sampling_ratio/min": 0.4408201575279236, + "sampling/sampling_logp_difference/max": 0.8191182613372803, + "sampling/sampling_logp_difference/mean": 0.016958583146333694, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 208.4375, + "completions/mean_terminated_length": 208.4375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2439727485179901, + "epoch": 2.2830882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5425184646331114, + "kl": 0.08408606052398682, + "learning_rate": 1.6596443719520826e-07, + "loss": -0.0401, + "num_tokens": 58849420.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6181564331054688, + "sampling/importance_sampling_ratio/mean": 0.9995671510696411, + "sampling/importance_sampling_ratio/min": 0.6246147751808167, + "sampling/sampling_logp_difference/max": 0.48128747940063477, + "sampling/sampling_logp_difference/mean": 0.014026173390448093, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 187.640625, + "completions/mean_terminated_length": 187.640625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.19454039633274078, + "epoch": 2.284313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0376193048901173, + "kl": 0.06733225286006927, + "learning_rate": 1.6543469682057104e-07, + "loss": 0.0007, + "num_tokens": 58878197.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5423756837844849, + "sampling/importance_sampling_ratio/mean": 0.9997547268867493, + "sampling/importance_sampling_ratio/min": 0.6241891980171204, + "sampling/sampling_logp_difference/max": 0.471301794052124, + "sampling/sampling_logp_difference/mean": 0.012020319700241089, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 206.15625, + "completions/mean_terminated_length": 206.15625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.21993784606456757, + "epoch": 2.2855392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2506955861577824, + "kl": 0.08632819354534149, + "learning_rate": 1.6490563559230357e-07, + "loss": 0.0059, + "num_tokens": 58906255.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5340826511383057, + "sampling/importance_sampling_ratio/mean": 0.9997017979621887, + "sampling/importance_sampling_ratio/min": 0.5940049886703491, + "sampling/sampling_logp_difference/max": 0.5208675861358643, + "sampling/sampling_logp_difference/mean": 0.012924928218126297, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 219.390625, + "completions/mean_terminated_length": 219.390625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.323246031999588, + "epoch": 2.286764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4698833468616563, + "kl": 0.11695494502782822, + "learning_rate": 1.6437725458436725e-07, + "loss": -0.0174, + "num_tokens": 58937480.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6660001277923584, + "sampling/importance_sampling_ratio/mean": 0.9996105432510376, + "sampling/importance_sampling_ratio/min": 0.5110235810279846, + "sampling/sampling_logp_difference/max": 0.6713396310806274, + "sampling/sampling_logp_difference/mean": 0.01686575450003147, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 191.171875, + "completions/mean_terminated_length": 191.171875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.13237124681472778, + "epoch": 2.2879901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.026347006718303, + "kl": 0.06299218535423279, + "learning_rate": 1.6384955486934154e-07, + "loss": 0.0087, + "num_tokens": 58964979.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6238988637924194, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 0.6016948223114014, + "sampling/sampling_logp_difference/max": 0.508004903793335, + "sampling/sampling_logp_difference/mean": 0.008998863399028778, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 202.359375, + "completions/mean_terminated_length": 202.359375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.198871448636055, + "epoch": 2.2892156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06890909192266657, + "kl": 0.06596966087818146, + "learning_rate": 1.633225375184239e-07, + "loss": 0.0007, + "num_tokens": 58995066.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.949591040611267, + "sampling/importance_sampling_ratio/mean": 1.000196099281311, + "sampling/importance_sampling_ratio/min": 0.4717702269554138, + "sampling/sampling_logp_difference/max": 0.7512632608413696, + "sampling/sampling_logp_difference/mean": 0.013516712002456188, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 185.09375, + "completions/mean_terminated_length": 185.09375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.22761297225952148, + "epoch": 2.2904411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04666079638369761, + "kl": 0.06524749845266342, + "learning_rate": 1.6279620360142594e-07, + "loss": 0.0007, + "num_tokens": 59020272.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6944836378097534, + "sampling/importance_sampling_ratio/mean": 1.0000618696212769, + "sampling/importance_sampling_ratio/min": 0.42703554034233093, + "sampling/sampling_logp_difference/max": 0.8508880138397217, + "sampling/sampling_logp_difference/mean": 0.014888203702867031, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 166.828125, + "completions/mean_terminated_length": 166.828125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.25784143805503845, + "epoch": 2.2916666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4682644214754028, + "kl": 0.09768915921449661, + "learning_rate": 1.62270554186772e-07, + "loss": 0.042, + "num_tokens": 59045173.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5245864391326904, + "sampling/importance_sampling_ratio/mean": 0.9994574785232544, + "sampling/importance_sampling_ratio/min": 0.5124073028564453, + "sampling/sampling_logp_difference/max": 0.668635368347168, + "sampling/sampling_logp_difference/mean": 0.014561614021658897, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 218.203125, + "completions/mean_terminated_length": 218.203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2707987427711487, + "epoch": 2.292892156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0328097232178153, + "kl": 0.09174228459596634, + "learning_rate": 1.6174559034149737e-07, + "loss": -0.0152, + "num_tokens": 59077506.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4908168315887451, + "sampling/importance_sampling_ratio/mean": 0.9995948076248169, + "sampling/importance_sampling_ratio/min": 0.5163322687149048, + "sampling/sampling_logp_difference/max": 0.6610047817230225, + "sampling/sampling_logp_difference/mean": 0.015276968479156494, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 219.9375, + "completions/mean_terminated_length": 219.9375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2424817532300949, + "epoch": 2.2941176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9338241687311134, + "kl": 0.07496918737888336, + "learning_rate": 1.6122131313124538e-07, + "loss": -0.0297, + "num_tokens": 59110350.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000536441802979, + "sampling/importance_sampling_ratio/min": 0.30553194880485535, + "sampling/sampling_logp_difference/max": 1.1857008934020996, + "sampling/sampling_logp_difference/mean": 0.014451341703534126, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 228.171875, + "completions/mean_terminated_length": 228.171875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.31136059761047363, + "epoch": 2.295343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.016010546987199, + "kl": 0.11611990630626678, + "learning_rate": 1.606977236202654e-07, + "loss": -0.0233, + "num_tokens": 59140937.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6187459230422974, + "sampling/importance_sampling_ratio/mean": 0.9999882578849792, + "sampling/importance_sampling_ratio/min": 0.5052246451377869, + "sampling/sampling_logp_difference/max": 0.6827521324157715, + "sampling/sampling_logp_difference/mean": 0.015141132287681103, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 179.921875, + "completions/mean_terminated_length": 179.921875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2291935831308365, + "epoch": 2.2965686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.435881528477487, + "kl": 0.10278773307800293, + "learning_rate": 1.6017482287141088e-07, + "loss": 0.0051, + "num_tokens": 59167844.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.9217751026153564, + "sampling/importance_sampling_ratio/mean": 0.9994111061096191, + "sampling/importance_sampling_ratio/min": 0.4882408678531647, + "sampling/sampling_logp_difference/max": 0.7169463634490967, + "sampling/sampling_logp_difference/mean": 0.013567205518484116, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 229.34375, + "completions/mean_terminated_length": 229.34375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.31835418939590454, + "epoch": 2.297794117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8047133088676053, + "kl": 0.12889903783798218, + "learning_rate": 1.5965261194613755e-07, + "loss": 0.0596, + "num_tokens": 59197994.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.9140546321868896, + "sampling/importance_sampling_ratio/mean": 1.0005314350128174, + "sampling/importance_sampling_ratio/min": 0.5223632454872131, + "sampling/sampling_logp_difference/max": 0.6493921279907227, + "sampling/sampling_logp_difference/mean": 0.016464825719594955, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 225.65625, + "completions/mean_terminated_length": 225.65625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.35226359963417053, + "epoch": 2.299019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7955400617526363, + "kl": 0.13876409828662872, + "learning_rate": 1.591310919045003e-07, + "loss": -0.0136, + "num_tokens": 59228084.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6346104145050049, + "sampling/importance_sampling_ratio/mean": 1.0000537633895874, + "sampling/importance_sampling_ratio/min": 0.475128173828125, + "sampling/sampling_logp_difference/max": 0.7441706657409668, + "sampling/sampling_logp_difference/mean": 0.019753381609916687, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 252.6875, + "completions/mean_terminated_length": 252.6875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.27226123213768005, + "epoch": 2.3002450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3958501420863585, + "kl": 0.06177687272429466, + "learning_rate": 1.5861026380515163e-07, + "loss": -0.0232, + "num_tokens": 59259760.0, + "reward": -0.5625, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": -0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.771236538887024, + "sampling/importance_sampling_ratio/mean": 1.0004510879516602, + "sampling/importance_sampling_ratio/min": 0.48663097620010376, + "sampling/sampling_logp_difference/max": 0.7202491760253906, + "sampling/sampling_logp_difference/mean": 0.014677229337394238, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 295.453125, + "completions/mean_terminated_length": 295.453125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.283691942691803, + "epoch": 2.301470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9297130937338883, + "kl": 0.07495174556970596, + "learning_rate": 1.5809012870533995e-07, + "loss": -0.0029, + "num_tokens": 59296525.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7762110233306885, + "sampling/importance_sampling_ratio/mean": 0.9998952150344849, + "sampling/importance_sampling_ratio/min": 0.48336297273635864, + "sampling/sampling_logp_difference/max": 0.7269874811172485, + "sampling/sampling_logp_difference/mean": 0.014811035245656967, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 223.765625, + "completions/mean_terminated_length": 223.765625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.23727725446224213, + "epoch": 2.3026960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8755819010904231, + "kl": 0.06590835005044937, + "learning_rate": 1.575706876609063e-07, + "loss": 0.0146, + "num_tokens": 59332974.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6210137605667114, + "sampling/importance_sampling_ratio/mean": 1.0001112222671509, + "sampling/importance_sampling_ratio/min": 0.2855037748813629, + "sampling/sampling_logp_difference/max": 1.2535001039505005, + "sampling/sampling_logp_difference/mean": 0.013454330153763294, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 263.46875, + "completions/mean_terminated_length": 263.46875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2924033999443054, + "epoch": 2.303921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6013525803897344, + "kl": 0.09922477602958679, + "learning_rate": 1.5705194172628323e-07, + "loss": 0.0914, + "num_tokens": 59372604.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6463134288787842, + "sampling/importance_sampling_ratio/mean": 0.9998865127563477, + "sampling/importance_sampling_ratio/min": 0.624093770980835, + "sampling/sampling_logp_difference/max": 0.4985384941101074, + "sampling/sampling_logp_difference/mean": 0.01429499126970768, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.26727503538131714, + "epoch": 2.3051470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1899354246291431, + "kl": 0.11983922123908997, + "learning_rate": 1.565338919544918e-07, + "loss": 0.0031, + "num_tokens": 59408364.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5612788200378418, + "sampling/importance_sampling_ratio/mean": 0.9999018907546997, + "sampling/importance_sampling_ratio/min": 0.5402755737304688, + "sampling/sampling_logp_difference/max": 0.6156759262084961, + "sampling/sampling_logp_difference/mean": 0.015409836545586586, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 241.25, + "completions/mean_terminated_length": 241.25, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.3307681381702423, + "epoch": 2.306372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9291671873146752, + "kl": 0.13036750257015228, + "learning_rate": 1.5601653939714072e-07, + "loss": -0.0016, + "num_tokens": 59449068.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002212524414062, + "sampling/importance_sampling_ratio/min": 0.3798368573188782, + "sampling/sampling_logp_difference/max": 0.9680135250091553, + "sampling/sampling_logp_difference/mean": 0.016231603920459747, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 179.6875, + "completions/mean_terminated_length": 179.6875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2324448525905609, + "epoch": 2.3075980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04347912088849565, + "kl": 0.06620276719331741, + "learning_rate": 1.5549988510442258e-07, + "loss": 0.0007, + "num_tokens": 59479784.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.826905369758606, + "sampling/importance_sampling_ratio/mean": 1.0006721019744873, + "sampling/importance_sampling_ratio/min": 0.4835604727268219, + "sampling/sampling_logp_difference/max": 0.726578950881958, + "sampling/sampling_logp_difference/mean": 0.013993888162076473, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 216.109375, + "completions/mean_terminated_length": 216.109375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.25666937232017517, + "epoch": 2.3088235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3402557592021203, + "kl": 0.0841069370508194, + "learning_rate": 1.5498393012511285e-07, + "loss": -0.0154, + "num_tokens": 59510287.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995979070663452, + "sampling/importance_sampling_ratio/min": 0.41927775740623474, + "sampling/sampling_logp_difference/max": 0.8762650489807129, + "sampling/sampling_logp_difference/mean": 0.01526167057454586, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 190.859375, + "completions/mean_terminated_length": 190.859375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.2237476408481598, + "epoch": 2.310049019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5917239669146777, + "kl": 0.12006629258394241, + "learning_rate": 1.5446867550656767e-07, + "loss": 0.0063, + "num_tokens": 59536566.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.597978949546814, + "sampling/importance_sampling_ratio/mean": 1.000736951828003, + "sampling/importance_sampling_ratio/min": 0.5315966010093689, + "sampling/sampling_logp_difference/max": 0.6318703889846802, + "sampling/sampling_logp_difference/mean": 0.013502996414899826, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 190.4375, + "completions/mean_terminated_length": 190.4375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.23544517159461975, + "epoch": 2.311274509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3060829679562909, + "kl": 0.08534479141235352, + "learning_rate": 1.5395412229472103e-07, + "loss": -0.0123, + "num_tokens": 59572738.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5971856117248535, + "sampling/importance_sampling_ratio/mean": 0.9998691082000732, + "sampling/importance_sampling_ratio/min": 0.2696689963340759, + "sampling/sampling_logp_difference/max": 1.3105599880218506, + "sampling/sampling_logp_difference/mean": 0.014193758368492126, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 170.8125, + "completions/mean_terminated_length": 170.8125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.19557109475135803, + "epoch": 2.3125, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.403418451183364, + "kl": 0.07334604859352112, + "learning_rate": 1.5344027153408374e-07, + "loss": -0.0423, + "num_tokens": 59611174.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7779353857040405, + "sampling/importance_sampling_ratio/mean": 1.0002080202102661, + "sampling/importance_sampling_ratio/min": 0.45178794860839844, + "sampling/sampling_logp_difference/max": 0.7945423126220703, + "sampling/sampling_logp_difference/mean": 0.013376087881624699, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 201.3125, + "completions/mean_terminated_length": 201.3125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.21097078919410706, + "epoch": 2.313725490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2451653351307033, + "kl": 0.08662876486778259, + "learning_rate": 1.5292712426773973e-07, + "loss": 0.0033, + "num_tokens": 59639706.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997841715812683, + "sampling/importance_sampling_ratio/min": 0.3311176896095276, + "sampling/sampling_logp_difference/max": 1.2261834144592285, + "sampling/sampling_logp_difference/mean": 0.01257825456559658, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 203.359375, + "completions/mean_terminated_length": 203.359375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.17383569478988647, + "epoch": 2.314950980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03679861989617035, + "kl": 0.054777417331933975, + "learning_rate": 1.5241468153734594e-07, + "loss": 0.0005, + "num_tokens": 59678385.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5678654909133911, + "sampling/importance_sampling_ratio/mean": 0.9997862577438354, + "sampling/importance_sampling_ratio/min": 0.6043789982795715, + "sampling/sampling_logp_difference/max": 0.5035538673400879, + "sampling/sampling_logp_difference/mean": 0.0109081557020545, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 159.453125, + "completions/mean_terminated_length": 159.453125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2851375639438629, + "epoch": 2.3161764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09114244957311834, + "kl": 0.17656400799751282, + "learning_rate": 1.5190294438312834e-07, + "loss": 0.0018, + "num_tokens": 59704062.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007810831069946, + "sampling/importance_sampling_ratio/mean": 0.9995872378349304, + "sampling/importance_sampling_ratio/min": 0.29912951588630676, + "sampling/sampling_logp_difference/max": 1.206878662109375, + "sampling/sampling_logp_difference/mean": 0.015117294155061245, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 147.703125, + "completions/mean_terminated_length": 147.703125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.24446986615657806, + "epoch": 2.3174019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.232726297540076, + "kl": 0.16684526205062866, + "learning_rate": 1.5139191384388094e-07, + "loss": -0.0017, + "num_tokens": 59728075.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004994869232178, + "sampling/importance_sampling_ratio/min": 0.6260403990745544, + "sampling/sampling_logp_difference/max": 0.7444643974304199, + "sampling/sampling_logp_difference/mean": 0.014580121263861656, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 237.421875, + "completions/mean_terminated_length": 237.421875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.15843701362609863, + "epoch": 2.318627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02920837417083462, + "kl": 0.039530105888843536, + "learning_rate": 1.5088159095696362e-07, + "loss": 0.0004, + "num_tokens": 59758310.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6272822618484497, + "sampling/importance_sampling_ratio/mean": 1.0002012252807617, + "sampling/importance_sampling_ratio/min": 0.6125374436378479, + "sampling/sampling_logp_difference/max": 0.490145206451416, + "sampling/sampling_logp_difference/mean": 0.009524786844849586, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 215.734375, + "completions/mean_terminated_length": 215.734375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.1598864495754242, + "epoch": 2.3198529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.331654909496703, + "kl": 0.06764960289001465, + "learning_rate": 1.5037197675829916e-07, + "loss": 0.0389, + "num_tokens": 59793061.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.7586230039596558, + "sampling/importance_sampling_ratio/mean": 1.0005803108215332, + "sampling/importance_sampling_ratio/min": 0.6109176874160767, + "sampling/sampling_logp_difference/max": 0.5645310878753662, + "sampling/sampling_logp_difference/mean": 0.009833090007305145, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 214.625, + "completions/mean_terminated_length": 214.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.24242526292800903, + "epoch": 2.321078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.039675668582658, + "kl": 0.07931549847126007, + "learning_rate": 1.4986307228237267e-07, + "loss": -0.0253, + "num_tokens": 59830637.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6120399236679077, + "sampling/importance_sampling_ratio/mean": 1.0000349283218384, + "sampling/importance_sampling_ratio/min": 0.5739162564277649, + "sampling/sampling_logp_difference/max": 0.5552718639373779, + "sampling/sampling_logp_difference/mean": 0.013587141409516335, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 194.09375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.28710392117500305, + "epoch": 2.3223039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3389509409273923, + "kl": 0.09452906250953674, + "learning_rate": 1.4935487856222723e-07, + "loss": -0.0157, + "num_tokens": 59863475.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6049695014953613, + "sampling/importance_sampling_ratio/mean": 0.99993896484375, + "sampling/importance_sampling_ratio/min": 0.30557137727737427, + "sampling/sampling_logp_difference/max": 1.1855719089508057, + "sampling/sampling_logp_difference/mean": 0.015732331201434135, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 216.421875, + "completions/mean_terminated_length": 216.421875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2689727544784546, + "epoch": 2.323529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3257165466554364, + "kl": 0.08077120780944824, + "learning_rate": 1.4884739662946445e-07, + "loss": 0.0103, + "num_tokens": 59898302.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999836266040802, + "sampling/importance_sampling_ratio/min": 0.20456689596176147, + "sampling/sampling_logp_difference/max": 1.586860179901123, + "sampling/sampling_logp_difference/mean": 0.016044773161411285, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 197.765625, + "completions/mean_terminated_length": 197.765625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.24810859560966492, + "epoch": 2.3247549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6701793822172892, + "kl": 0.09056637436151505, + "learning_rate": 1.4834062751424015e-07, + "loss": 0.0104, + "num_tokens": 59931279.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4412437677383423, + "sampling/importance_sampling_ratio/mean": 0.9997663497924805, + "sampling/importance_sampling_ratio/min": 0.3919161856174469, + "sampling/sampling_logp_difference/max": 0.9367072582244873, + "sampling/sampling_logp_difference/mean": 0.01493392325937748, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 228.4375, + "completions/mean_terminated_length": 228.4375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.22810769081115723, + "epoch": 2.325980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026946671836266173, + "kl": 0.04421919584274292, + "learning_rate": 1.478345722452639e-07, + "loss": 0.0004, + "num_tokens": 59962059.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6555689573287964, + "sampling/importance_sampling_ratio/mean": 1.0000898838043213, + "sampling/importance_sampling_ratio/min": 0.3941422700881958, + "sampling/sampling_logp_difference/max": 0.9310433864593506, + "sampling/sampling_logp_difference/mean": 0.013491123914718628, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 183.0, + "completions/mean_terminated_length": 183.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.25881195068359375, + "epoch": 2.327205882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7893686702587364, + "kl": 0.08352837711572647, + "learning_rate": 1.4732923184979562e-07, + "loss": 0.0252, + "num_tokens": 59991067.0, + "reward": 0.75, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4388809204101562, + "sampling/importance_sampling_ratio/mean": 1.0000529289245605, + "sampling/importance_sampling_ratio/min": 0.5265589356422424, + "sampling/sampling_logp_difference/max": 0.6413919925689697, + "sampling/sampling_logp_difference/mean": 0.013214487582445145, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 180.5, + "completions/mean_terminated_length": 180.5, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2558945119380951, + "epoch": 2.3284313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4142773353536864, + "kl": 0.08801299333572388, + "learning_rate": 1.4682460735364422e-07, + "loss": 0.0054, + "num_tokens": 60019115.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5824116468429565, + "sampling/importance_sampling_ratio/mean": 0.9999874234199524, + "sampling/importance_sampling_ratio/min": 0.5375378131866455, + "sampling/sampling_logp_difference/max": 0.6207561492919922, + "sampling/sampling_logp_difference/mean": 0.014198105782270432, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 159.84375, + "completions/mean_terminated_length": 159.84375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.18878209590911865, + "epoch": 2.329656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052108689172164484, + "kl": 0.05853067338466644, + "learning_rate": 1.4632069978116584e-07, + "loss": 0.0005, + "num_tokens": 60046369.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007681846618652, + "sampling/importance_sampling_ratio/min": 0.5236412882804871, + "sampling/sampling_logp_difference/max": 0.9863035678863525, + "sampling/sampling_logp_difference/mean": 0.012843549251556396, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 204.3125, + "completions/mean_terminated_length": 204.3125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.24379225075244904, + "epoch": 2.3308823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10996350721454363, + "kl": 0.09064918756484985, + "learning_rate": 1.4581751015526033e-07, + "loss": 0.001, + "num_tokens": 60076453.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995670318603516, + "sampling/importance_sampling_ratio/min": 0.2191912680864334, + "sampling/sampling_logp_difference/max": 1.517810583114624, + "sampling/sampling_logp_difference/mean": 0.014508318156003952, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 176.625, + "completions/mean_terminated_length": 176.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.23957127332687378, + "epoch": 2.332107843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5767839733834388, + "kl": 0.12310803681612015, + "learning_rate": 1.4531503949737106e-07, + "loss": 0.0196, + "num_tokens": 60108045.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5311764478683472, + "sampling/importance_sampling_ratio/mean": 1.000032901763916, + "sampling/importance_sampling_ratio/min": 0.13440155982971191, + "sampling/sampling_logp_difference/max": 2.006923198699951, + "sampling/sampling_logp_difference/mean": 0.013991523534059525, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 224.21875, + "completions/mean_terminated_length": 224.21875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.21522961556911469, + "epoch": 2.3333333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6215186486844329, + "kl": 0.10068541765213013, + "learning_rate": 1.4481328882748184e-07, + "loss": 0.0134, + "num_tokens": 60137611.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.757907748222351, + "sampling/importance_sampling_ratio/mean": 1.0003650188446045, + "sampling/importance_sampling_ratio/min": 0.4751411974430084, + "sampling/sampling_logp_difference/max": 0.7441432476043701, + "sampling/sampling_logp_difference/mean": 0.012857379391789436, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 244.828125, + "completions/mean_terminated_length": 244.828125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2058878242969513, + "epoch": 2.3345588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9255789119409037, + "kl": 0.057641707360744476, + "learning_rate": 1.4431225916411455e-07, + "loss": -0.026, + "num_tokens": 60168128.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.9002087116241455, + "sampling/importance_sampling_ratio/mean": 0.9999744296073914, + "sampling/importance_sampling_ratio/min": 0.5323302149772644, + "sampling/sampling_logp_difference/max": 0.6419637203216553, + "sampling/sampling_logp_difference/mean": 0.012306640855967999, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 169.734375, + "completions/mean_terminated_length": 169.734375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.253532350063324, + "epoch": 2.3357843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13391592878802164, + "kl": 0.11147284507751465, + "learning_rate": 1.4381195152432769e-07, + "loss": 0.0011, + "num_tokens": 60201119.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997863173484802, + "sampling/importance_sampling_ratio/min": 0.2298680990934372, + "sampling/sampling_logp_difference/max": 1.4702496528625488, + "sampling/sampling_logp_difference/mean": 0.016630372032523155, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 175.75, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.16174277663230896, + "epoch": 2.3370098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8753449133823992, + "kl": 0.07965998351573944, + "learning_rate": 1.4331236692371384e-07, + "loss": 0.0643, + "num_tokens": 60226479.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 0.999763548374176, + "sampling/importance_sampling_ratio/min": 0.5685030817985535, + "sampling/sampling_logp_difference/max": 0.5647485256195068, + "sampling/sampling_logp_difference/mean": 0.010479824617505074, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 176.234375, + "completions/mean_terminated_length": 176.234375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.26102542877197266, + "epoch": 2.338235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0885321248888284, + "kl": 0.07639878243207932, + "learning_rate": 1.428135063763985e-07, + "loss": -0.0041, + "num_tokens": 60259150.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6030136346817017, + "sampling/importance_sampling_ratio/mean": 1.0001156330108643, + "sampling/importance_sampling_ratio/min": 0.5586609840393066, + "sampling/sampling_logp_difference/max": 0.5822124481201172, + "sampling/sampling_logp_difference/mean": 0.016059918329119682, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 198.234375, + "completions/mean_terminated_length": 198.234375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.23330505192279816, + "epoch": 2.3394607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8393192967961558, + "kl": 0.07623052597045898, + "learning_rate": 1.4231537089503675e-07, + "loss": -0.0249, + "num_tokens": 60290381.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8180720806121826, + "sampling/importance_sampling_ratio/mean": 0.9994747638702393, + "sampling/importance_sampling_ratio/min": 0.6151342391967773, + "sampling/sampling_logp_difference/max": 0.5977766513824463, + "sampling/sampling_logp_difference/mean": 0.014114055782556534, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 258.6875, + "completions/mean_terminated_length": 258.6875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.27300840616226196, + "epoch": 2.340686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0987861380741548, + "kl": 0.060729093849658966, + "learning_rate": 1.4181796149081194e-07, + "loss": -0.0326, + "num_tokens": 60327785.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.644620418548584, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 0.46871840953826904, + "sampling/sampling_logp_difference/max": 0.7577531337738037, + "sampling/sampling_logp_difference/mean": 0.014157635159790516, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.21560019254684448, + "epoch": 2.3419117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.417307635747955, + "kl": 0.12530824542045593, + "learning_rate": 1.4132127917343394e-07, + "loss": 0.0846, + "num_tokens": 60357441.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002789497375488, + "sampling/importance_sampling_ratio/min": 0.49835866689682007, + "sampling/sampling_logp_difference/max": 0.8950405120849609, + "sampling/sampling_logp_difference/mean": 0.012443248182535172, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 190.53125, + "completions/mean_terminated_length": 190.53125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.28844764828681946, + "epoch": 2.343137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7136256914682377, + "kl": 0.09305107593536377, + "learning_rate": 1.4082532495113624e-07, + "loss": 0.0809, + "num_tokens": 60386579.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5612086057662964, + "sampling/importance_sampling_ratio/mean": 0.9995530843734741, + "sampling/importance_sampling_ratio/min": 0.4292699098587036, + "sampling/sampling_logp_difference/max": 0.8456693887710571, + "sampling/sampling_logp_difference/mean": 0.014590008184313774, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 223.234375, + "completions/mean_terminated_length": 223.234375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.19692984223365784, + "epoch": 2.344362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04324036073446108, + "kl": 0.05780654400587082, + "learning_rate": 1.4033009983067452e-07, + "loss": 0.0006, + "num_tokens": 60420386.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7776691913604736, + "sampling/importance_sampling_ratio/mean": 1.0006442070007324, + "sampling/importance_sampling_ratio/min": 0.5536872148513794, + "sampling/sampling_logp_difference/max": 0.5911552906036377, + "sampling/sampling_logp_difference/mean": 0.010026085190474987, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 197.3125, + "completions/mean_terminated_length": 197.3125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.27616995573043823, + "epoch": 2.3455882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.419558332972906, + "kl": 0.1510027050971985, + "learning_rate": 1.398356048173242e-07, + "loss": 0.0202, + "num_tokens": 60453222.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8812905550003052, + "sampling/importance_sampling_ratio/mean": 0.9990477561950684, + "sampling/importance_sampling_ratio/min": 0.5322301387786865, + "sampling/sampling_logp_difference/max": 0.6319580078125, + "sampling/sampling_logp_difference/mean": 0.016108594834804535, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 221.609375, + "completions/mean_terminated_length": 221.609375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.2639700770378113, + "epoch": 2.346813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0590373478268813, + "kl": 0.08162802457809448, + "learning_rate": 1.3934184091487915e-07, + "loss": -0.0061, + "num_tokens": 60481821.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999839186668396, + "sampling/importance_sampling_ratio/min": 0.5718647837638855, + "sampling/sampling_logp_difference/max": 0.7172503471374512, + "sampling/sampling_logp_difference/mean": 0.012737276032567024, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 164.171875, + "completions/mean_terminated_length": 164.171875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.22160595655441284, + "epoch": 2.3480392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04540446260471624, + "kl": 0.06879030168056488, + "learning_rate": 1.3884880912564873e-07, + "loss": 0.0007, + "num_tokens": 60508328.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9665645360946655, + "sampling/importance_sampling_ratio/mean": 1.000072717666626, + "sampling/importance_sampling_ratio/min": 0.5566588044166565, + "sampling/sampling_logp_difference/max": 0.6762881278991699, + "sampling/sampling_logp_difference/mean": 0.012744169682264328, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 180.5625, + "completions/mean_terminated_length": 180.5625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.22264420986175537, + "epoch": 2.349264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050170777608224214, + "kl": 0.07316240668296814, + "learning_rate": 1.3835651045045598e-07, + "loss": 0.0007, + "num_tokens": 60533996.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4916131496429443, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 0.5367288589477539, + "sampling/sampling_logp_difference/max": 0.6222622394561768, + "sampling/sampling_logp_difference/mean": 0.012823224067687988, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.22194664180278778, + "epoch": 2.3504901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7970809259858578, + "kl": 0.0707157701253891, + "learning_rate": 1.3786494588863633e-07, + "loss": 0.0615, + "num_tokens": 60574804.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9481863975524902, + "sampling/importance_sampling_ratio/mean": 0.999103307723999, + "sampling/importance_sampling_ratio/min": 0.5347524881362915, + "sampling/sampling_logp_difference/max": 0.6668989658355713, + "sampling/sampling_logp_difference/mean": 0.012659601867198944, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 222.296875, + "completions/mean_terminated_length": 222.296875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2153092473745346, + "epoch": 2.3517156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.214353351093163, + "kl": 0.06867530941963196, + "learning_rate": 1.3737411643803448e-07, + "loss": -0.0732, + "num_tokens": 60607047.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.8346688747406006, + "sampling/importance_sampling_ratio/mean": 0.9999586343765259, + "sampling/importance_sampling_ratio/min": 0.6097875833511353, + "sampling/sampling_logp_difference/max": 0.6068639755249023, + "sampling/sampling_logp_difference/mean": 0.013082252815365791, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 234.03125, + "completions/mean_terminated_length": 234.03125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2702324390411377, + "epoch": 2.3529411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0443638998213056, + "kl": 0.06359662860631943, + "learning_rate": 1.368840230950035e-07, + "loss": 0.0009, + "num_tokens": 60641929.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5788229703903198, + "sampling/importance_sampling_ratio/mean": 0.9997637867927551, + "sampling/importance_sampling_ratio/min": 0.5266174674034119, + "sampling/sampling_logp_difference/max": 0.6412808895111084, + "sampling/sampling_logp_difference/mean": 0.015107502229511738, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 179.140625, + "completions/mean_terminated_length": 179.140625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.23998671770095825, + "epoch": 2.3541666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0589424191865848, + "kl": 0.060683056712150574, + "learning_rate": 1.3639466685440132e-07, + "loss": 0.0006, + "num_tokens": 60671042.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000229001045227, + "sampling/importance_sampling_ratio/min": 0.5057139992713928, + "sampling/sampling_logp_difference/max": 0.7327079772949219, + "sampling/sampling_logp_difference/mean": 0.01452082209289074, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 154.0, + "completions/mean_terminated_length": 154.0, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.22048285603523254, + "epoch": 2.355392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050767824219279896, + "kl": 0.07472100853919983, + "learning_rate": 1.3590604870959043e-07, + "loss": 0.0008, + "num_tokens": 60696994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9099973440170288, + "sampling/importance_sampling_ratio/mean": 0.9994252920150757, + "sampling/importance_sampling_ratio/min": 0.29913097620010376, + "sampling/sampling_logp_difference/max": 1.2068737745285034, + "sampling/sampling_logp_difference/mean": 0.013492463156580925, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 217.3125, + "completions/mean_terminated_length": 217.3125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.23784899711608887, + "epoch": 2.3566176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6033230741854918, + "kl": 0.07405094802379608, + "learning_rate": 1.3541816965243462e-07, + "loss": -0.0331, + "num_tokens": 60732646.0, + "reward": 0.125, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996098279953003, + "sampling/importance_sampling_ratio/min": 0.3956761956214905, + "sampling/sampling_logp_difference/max": 0.9771990776062012, + "sampling/sampling_logp_difference/mean": 0.013517213985323906, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 145.296875, + "completions/mean_terminated_length": 145.296875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.21970178186893463, + "epoch": 2.357843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0700749236722222, + "kl": 0.082846499979496, + "learning_rate": 1.3493103067329737e-07, + "loss": 0.0008, + "num_tokens": 60760249.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7992300987243652, + "sampling/importance_sampling_ratio/mean": 1.000280499458313, + "sampling/importance_sampling_ratio/min": 0.5525192618370056, + "sampling/sampling_logp_difference/max": 0.5932669639587402, + "sampling/sampling_logp_difference/mean": 0.01530250534415245, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 271.921875, + "completions/mean_terminated_length": 271.921875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.21385936439037323, + "epoch": 2.3590686274509802, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5391506686481062, + "kl": 0.06280796229839325, + "learning_rate": 1.3444463276104012e-07, + "loss": 0.0592, + "num_tokens": 60796628.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8644733428955078, + "sampling/importance_sampling_ratio/mean": 1.0002459287643433, + "sampling/importance_sampling_ratio/min": 0.537388801574707, + "sampling/sampling_logp_difference/max": 0.622978687286377, + "sampling/sampling_logp_difference/mean": 0.014365228824317455, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 200.84375, + "completions/mean_terminated_length": 200.84375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2779545783996582, + "epoch": 2.360294117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.80709516557649, + "kl": 0.09816673398017883, + "learning_rate": 1.3395897690301966e-07, + "loss": 0.171, + "num_tokens": 60829978.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7608885765075684, + "sampling/importance_sampling_ratio/mean": 1.000191330909729, + "sampling/importance_sampling_ratio/min": 0.5771064162254333, + "sampling/sampling_logp_difference/max": 0.5658185482025146, + "sampling/sampling_logp_difference/mean": 0.014423665590584278, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 232.703125, + "completions/mean_terminated_length": 232.703125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.21351173520088196, + "epoch": 2.361519607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.970063551926708, + "kl": 0.06985197961330414, + "learning_rate": 1.3347406408508694e-07, + "loss": 0.0851, + "num_tokens": 60860887.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6141871213912964, + "sampling/importance_sampling_ratio/mean": 1.0003697872161865, + "sampling/importance_sampling_ratio/min": 0.4937157928943634, + "sampling/sampling_logp_difference/max": 0.7057952880859375, + "sampling/sampling_logp_difference/mean": 0.011303408071398735, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 238.671875, + "completions/mean_terminated_length": 238.671875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.23693378269672394, + "epoch": 2.3627450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4764931315312262, + "kl": 0.062237031757831573, + "learning_rate": 1.3298989529158378e-07, + "loss": 0.0926, + "num_tokens": 60896994.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5772426128387451, + "sampling/importance_sampling_ratio/mean": 0.9998999834060669, + "sampling/importance_sampling_ratio/min": 0.5110949873924255, + "sampling/sampling_logp_difference/max": 0.6711997985839844, + "sampling/sampling_logp_difference/mean": 0.012698430567979813, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 148.5625, + "completions/mean_terminated_length": 148.5625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.1978602111339569, + "epoch": 2.363970588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4896909152778242, + "kl": 0.09970251470804214, + "learning_rate": 1.325064715053425e-07, + "loss": 0.0155, + "num_tokens": 60920598.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.581521987915039, + "sampling/importance_sampling_ratio/mean": 1.0005525350570679, + "sampling/importance_sampling_ratio/min": 0.5561189651489258, + "sampling/sampling_logp_difference/max": 0.5867730379104614, + "sampling/sampling_logp_difference/mean": 0.012599577195942402, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 222.046875, + "completions/mean_terminated_length": 222.046875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.29237282276153564, + "epoch": 2.3651960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.298288706391106, + "kl": 0.08031923323869705, + "learning_rate": 1.320237937076825e-07, + "loss": 0.0022, + "num_tokens": 60953257.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.65193510055542, + "sampling/importance_sampling_ratio/mean": 0.999497652053833, + "sampling/importance_sampling_ratio/min": 0.4843113422393799, + "sampling/sampling_logp_difference/max": 0.725027322769165, + "sampling/sampling_logp_difference/mean": 0.015679839998483658, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 185.296875, + "completions/mean_terminated_length": 185.296875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2450130134820938, + "epoch": 2.366421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07191566437545724, + "kl": 0.08761993050575256, + "learning_rate": 1.3154186287840946e-07, + "loss": 0.0009, + "num_tokens": 60984252.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8617053031921387, + "sampling/importance_sampling_ratio/mean": 1.0000516176223755, + "sampling/importance_sampling_ratio/min": 0.4116455316543579, + "sampling/sampling_logp_difference/max": 0.8875926733016968, + "sampling/sampling_logp_difference/mean": 0.015412520617246628, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 191.0625, + "completions/mean_terminated_length": 191.0625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2743494510650635, + "epoch": 2.3676470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5918629660933312, + "kl": 0.112302765250206, + "learning_rate": 1.310606799958122e-07, + "loss": 0.0379, + "num_tokens": 61018288.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6630561351776123, + "sampling/importance_sampling_ratio/mean": 1.0001001358032227, + "sampling/importance_sampling_ratio/min": 0.6094446778297424, + "sampling/sampling_logp_difference/max": 0.5086569786071777, + "sampling/sampling_logp_difference/mean": 0.015683766454458237, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 227.078125, + "completions/mean_terminated_length": 227.078125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.27433329820632935, + "epoch": 2.368872549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.584208504819516, + "kl": 0.06815879046916962, + "learning_rate": 1.305802460366615e-07, + "loss": 0.0219, + "num_tokens": 61057557.0, + "reward": 0.15625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.897632122039795, + "sampling/importance_sampling_ratio/mean": 0.9998785257339478, + "sampling/importance_sampling_ratio/min": 0.5910902619361877, + "sampling/sampling_logp_difference/max": 0.6406068801879883, + "sampling/sampling_logp_difference/mean": 0.01572701334953308, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 156.140625, + "completions/mean_terminated_length": 156.140625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.23684397339820862, + "epoch": 2.3700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2777498029803074, + "kl": 0.08773821592330933, + "learning_rate": 1.3010056197620812e-07, + "loss": -0.0014, + "num_tokens": 61087326.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.8796398639678955, + "sampling/importance_sampling_ratio/mean": 1.0005260705947876, + "sampling/importance_sampling_ratio/min": 0.3654533624649048, + "sampling/sampling_logp_difference/max": 1.0066165924072266, + "sampling/sampling_logp_difference/mean": 0.01609150506556034, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 251.4375, + "completions/mean_terminated_length": 251.4375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.33900707960128784, + "epoch": 2.3713235294117645, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.074405565419832, + "kl": 0.1103014275431633, + "learning_rate": 1.2962162878817985e-07, + "loss": 0.0191, + "num_tokens": 61125354.0, + "reward": 0.0625, + "reward_std": 0.5501632690429688, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5916136503219604, + "sampling/importance_sampling_ratio/mean": 1.0007879734039307, + "sampling/importance_sampling_ratio/min": 0.60539710521698, + "sampling/sampling_logp_difference/max": 0.5018706321716309, + "sampling/sampling_logp_difference/mean": 0.017022768035531044, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 242.453125, + "completions/mean_terminated_length": 242.453125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.23958539962768555, + "epoch": 2.372549019607843, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5269838185566615, + "kl": 0.08082319051027298, + "learning_rate": 1.2914344744478112e-07, + "loss": 0.0932, + "num_tokens": 61159911.0, + "reward": 0.90625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.7728302478790283, + "sampling/importance_sampling_ratio/mean": 1.0002799034118652, + "sampling/importance_sampling_ratio/min": 0.6110666990280151, + "sampling/sampling_logp_difference/max": 0.5725772380828857, + "sampling/sampling_logp_difference/mean": 0.013866308145225048, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 203.890625, + "completions/mean_terminated_length": 203.890625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.22986426949501038, + "epoch": 2.373774509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.042505482485899, + "kl": 0.08157625794410706, + "learning_rate": 1.2866601891668942e-07, + "loss": 0.0314, + "num_tokens": 61189232.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5656235218048096, + "sampling/importance_sampling_ratio/mean": 1.000196099281311, + "sampling/importance_sampling_ratio/min": 0.3692629337310791, + "sampling/sampling_logp_difference/max": 0.996246337890625, + "sampling/sampling_logp_difference/mean": 0.013883614912629128, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 176.28125, + "completions/mean_terminated_length": 176.28125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.1920793056488037, + "epoch": 2.375, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3512202642752835, + "kl": 0.09101974219083786, + "learning_rate": 1.2818934417305477e-07, + "loss": 0.0611, + "num_tokens": 61217746.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6356041431427002, + "sampling/importance_sampling_ratio/mean": 1.0001342296600342, + "sampling/importance_sampling_ratio/min": 0.18803855776786804, + "sampling/sampling_logp_difference/max": 1.6711082458496094, + "sampling/sampling_logp_difference/mean": 0.012220809236168861, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 189.171875, + "completions/mean_terminated_length": 189.171875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.22339913249015808, + "epoch": 2.376225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3587392805565413, + "kl": 0.07989152520895004, + "learning_rate": 1.2771342418149656e-07, + "loss": -0.0014, + "num_tokens": 61250909.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.753654956817627, + "sampling/importance_sampling_ratio/mean": 1.00027334690094, + "sampling/importance_sampling_ratio/min": 0.2822793126106262, + "sampling/sampling_logp_difference/max": 1.2648582458496094, + "sampling/sampling_logp_difference/mean": 0.015397697687149048, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 212.09375, + "completions/mean_terminated_length": 212.09375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.333929181098938, + "epoch": 2.377450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6350576499593368, + "kl": 0.15975189208984375, + "learning_rate": 1.2723825990810204e-07, + "loss": 0.0341, + "num_tokens": 61283187.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002503395080566, + "sampling/importance_sampling_ratio/min": 0.6097145676612854, + "sampling/sampling_logp_difference/max": 0.7700153589248657, + "sampling/sampling_logp_difference/mean": 0.01582321710884571, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 200.515625, + "completions/mean_terminated_length": 200.515625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.1840367615222931, + "epoch": 2.3786764705882355, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.836295613462871, + "kl": 0.06945012509822845, + "learning_rate": 1.2676385231742494e-07, + "loss": 0.0797, + "num_tokens": 61312564.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.8142162561416626, + "sampling/importance_sampling_ratio/mean": 1.0005759000778198, + "sampling/importance_sampling_ratio/min": 0.4323919415473938, + "sampling/sampling_logp_difference/max": 0.8384228944778442, + "sampling/sampling_logp_difference/mean": 0.011091177351772785, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 211.75, + "completions/mean_terminated_length": 211.75, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.19584491848945618, + "epoch": 2.3799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3394160207249557, + "kl": 0.05044228583574295, + "learning_rate": 1.262902023724824e-07, + "loss": -0.0549, + "num_tokens": 61342788.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5584990978240967, + "sampling/importance_sampling_ratio/mean": 0.9999395608901978, + "sampling/importance_sampling_ratio/min": 0.5607408285140991, + "sampling/sampling_logp_difference/max": 0.5784964561462402, + "sampling/sampling_logp_difference/mean": 0.01116795465350151, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 226.15625, + "completions/mean_terminated_length": 226.15625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3171435296535492, + "epoch": 2.381127450980392, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7356636484164167, + "kl": 0.12276682257652283, + "learning_rate": 1.258173110347538e-07, + "loss": -0.0132, + "num_tokens": 61386430.0, + "reward": -0.21875, + "reward_std": 0.7297805547714233, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.7008626461029053, + "sampling/importance_sampling_ratio/mean": 1.0000245571136475, + "sampling/importance_sampling_ratio/min": 0.48895463347435, + "sampling/sampling_logp_difference/max": 0.7154855728149414, + "sampling/sampling_logp_difference/mean": 0.01608789712190628, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 205.171875, + "completions/mean_terminated_length": 205.171875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2562256455421448, + "epoch": 2.3823529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7410033887855307, + "kl": 0.11961342394351959, + "learning_rate": 1.253451792641785e-07, + "loss": 0.0057, + "num_tokens": 61418857.0, + "reward": 0.0, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.702419638633728, + "sampling/importance_sampling_ratio/mean": 0.9998294711112976, + "sampling/importance_sampling_ratio/min": 0.5145463347434998, + "sampling/sampling_logp_difference/max": 0.6644695997238159, + "sampling/sampling_logp_difference/mean": 0.015101276338100433, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3039652705192566, + "epoch": 2.383578431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.344275615023439, + "kl": 0.10801234841346741, + "learning_rate": 1.248738080191543e-07, + "loss": 0.0113, + "num_tokens": 61446689.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.911575436592102, + "sampling/importance_sampling_ratio/mean": 1.0003383159637451, + "sampling/importance_sampling_ratio/min": 0.5024508833885193, + "sampling/sampling_logp_difference/max": 0.6882574558258057, + "sampling/sampling_logp_difference/mean": 0.016542499884963036, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 173.09375, + "completions/mean_terminated_length": 173.09375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.24207086861133575, + "epoch": 2.3848039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06780617246457922, + "kl": 0.10478676110506058, + "learning_rate": 1.244031982565349e-07, + "loss": 0.0012, + "num_tokens": 61472359.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4732166528701782, + "sampling/importance_sampling_ratio/mean": 0.9993559122085571, + "sampling/importance_sampling_ratio/min": 0.5910095572471619, + "sampling/sampling_logp_difference/max": 0.5259230136871338, + "sampling/sampling_logp_difference/mean": 0.013632368296384811, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 189.171875, + "completions/mean_terminated_length": 189.171875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.31813693046569824, + "epoch": 2.386029411764706, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.458377801973784, + "kl": 0.11870501935482025, + "learning_rate": 1.239333509316281e-07, + "loss": -0.0297, + "num_tokens": 61505122.0, + "reward": 0.40625, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.9482307434082031, + "sampling/importance_sampling_ratio/mean": 0.9993807077407837, + "sampling/importance_sampling_ratio/min": 0.5460110306739807, + "sampling/sampling_logp_difference/max": 0.6669216156005859, + "sampling/sampling_logp_difference/mean": 0.016893664374947548, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 200.96875, + "completions/mean_terminated_length": 200.96875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.21835170686244965, + "epoch": 2.3872549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3072731399420443, + "kl": 0.07475827634334564, + "learning_rate": 1.2346426699819456e-07, + "loss": 0.0075, + "num_tokens": 61536496.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6572539806365967, + "sampling/importance_sampling_ratio/mean": 1.000198483467102, + "sampling/importance_sampling_ratio/min": 0.5988007187843323, + "sampling/sampling_logp_difference/max": 0.5128264427185059, + "sampling/sampling_logp_difference/mean": 0.0124274967238307, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 192.59375, + "completions/mean_terminated_length": 192.59375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.1802973747253418, + "epoch": 2.388480392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04346566312788138, + "kl": 0.062366824597120285, + "learning_rate": 1.2299594740844476e-07, + "loss": 0.0006, + "num_tokens": 61565702.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8180396556854248, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 0.6013502478599548, + "sampling/sampling_logp_difference/max": 0.5977587699890137, + "sampling/sampling_logp_difference/mean": 0.011643131263554096, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 160.71875, + "completions/mean_terminated_length": 160.71875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.21568229794502258, + "epoch": 2.389705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07281075137744002, + "kl": 0.10016850382089615, + "learning_rate": 1.225283931130378e-07, + "loss": 0.001, + "num_tokens": 61590868.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4943758249282837, + "sampling/importance_sampling_ratio/mean": 0.9999530911445618, + "sampling/importance_sampling_ratio/min": 0.48211976885795593, + "sampling/sampling_logp_difference/max": 0.7295627593994141, + "sampling/sampling_logp_difference/mean": 0.013741142116487026, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 173.75, + "completions/mean_terminated_length": 173.75, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.23186248540878296, + "epoch": 2.3909313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042213109083424606, + "kl": 0.06597447395324707, + "learning_rate": 1.220616050610791e-07, + "loss": 0.0006, + "num_tokens": 61618916.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997356534004211, + "sampling/importance_sampling_ratio/min": 0.5676605105400085, + "sampling/sampling_logp_difference/max": 0.8005461692810059, + "sampling/sampling_logp_difference/mean": 0.014332741498947144, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 190.359375, + "completions/mean_terminated_length": 190.359375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.24763555824756622, + "epoch": 2.392156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3624870551913222, + "kl": 0.07363063097000122, + "learning_rate": 1.2159558420011905e-07, + "loss": 0.0208, + "num_tokens": 61649259.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6884815692901611, + "sampling/importance_sampling_ratio/mean": 1.000169277191162, + "sampling/importance_sampling_ratio/min": 0.3574312925338745, + "sampling/sampling_logp_difference/max": 1.0288121700286865, + "sampling/sampling_logp_difference/mean": 0.014056330546736717, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 240.234375, + "completions/mean_terminated_length": 240.234375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.20466625690460205, + "epoch": 2.3933823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.442820055251828, + "kl": 0.09523437917232513, + "learning_rate": 1.2113033147615071e-07, + "loss": 0.0382, + "num_tokens": 61679434.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002615451812744, + "sampling/importance_sampling_ratio/min": 0.38819387555122375, + "sampling/sampling_logp_difference/max": 1.252941608428955, + "sampling/sampling_logp_difference/mean": 0.01280001550912857, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 187.640625, + "completions/mean_terminated_length": 187.640625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2240035980939865, + "epoch": 2.394607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2719615435687175, + "kl": 0.05534994229674339, + "learning_rate": 1.206658478336071e-07, + "loss": 0.0253, + "num_tokens": 61708291.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.601393699645996, + "sampling/importance_sampling_ratio/mean": 1.0005192756652832, + "sampling/importance_sampling_ratio/min": 0.6298378109931946, + "sampling/sampling_logp_difference/max": 0.4708743095397949, + "sampling/sampling_logp_difference/mean": 0.012002687901258469, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 189.609375, + "completions/mean_terminated_length": 189.609375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2199474275112152, + "epoch": 2.3958333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043589706437910984, + "kl": 0.07419893145561218, + "learning_rate": 1.2020213421536103e-07, + "loss": 0.0007, + "num_tokens": 61737386.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999531090259552, + "sampling/importance_sampling_ratio/min": 0.5294254422187805, + "sampling/sampling_logp_difference/max": 0.7695995569229126, + "sampling/sampling_logp_difference/mean": 0.013445645570755005, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 184.671875, + "completions/mean_terminated_length": 184.671875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.309781551361084, + "epoch": 2.3970588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05492677060050852, + "kl": 0.08925427496433258, + "learning_rate": 1.1973919156272138e-07, + "loss": 0.0009, + "num_tokens": 61772485.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7128942012786865, + "sampling/importance_sampling_ratio/mean": 1.0004339218139648, + "sampling/importance_sampling_ratio/min": 0.48819631338119507, + "sampling/sampling_logp_difference/max": 0.7170376777648926, + "sampling/sampling_logp_difference/mean": 0.017541082575917244, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 178.140625, + "completions/mean_terminated_length": 178.140625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.27200642228126526, + "epoch": 2.3982843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6495497483034853, + "kl": 0.09907713532447815, + "learning_rate": 1.1927702081543278e-07, + "loss": 0.0149, + "num_tokens": 61802238.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6173079013824463, + "sampling/importance_sampling_ratio/mean": 1.0000338554382324, + "sampling/importance_sampling_ratio/min": 0.4597283601760864, + "sampling/sampling_logp_difference/max": 0.777119517326355, + "sampling/sampling_logp_difference/mean": 0.01572125218808651, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 202.1875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2908550202846527, + "epoch": 2.3995098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9769531666950955, + "kl": 0.08010758459568024, + "learning_rate": 1.188156229116724e-07, + "loss": 0.0103, + "num_tokens": 61843258.0, + "reward": -0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6162408590316772, + "sampling/importance_sampling_ratio/mean": 1.000512719154358, + "sampling/importance_sampling_ratio/min": 0.5101743936538696, + "sampling/sampling_logp_difference/max": 0.6730027198791504, + "sampling/sampling_logp_difference/mean": 0.01668713241815567, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 180.828125, + "completions/mean_terminated_length": 180.828125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.21159318089485168, + "epoch": 2.400735294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9940042567980671, + "kl": 0.06886664032936096, + "learning_rate": 1.1835499878804861e-07, + "loss": 0.0111, + "num_tokens": 61873087.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8365287780761719, + "sampling/importance_sampling_ratio/mean": 0.999881386756897, + "sampling/importance_sampling_ratio/min": 0.4955132007598877, + "sampling/sampling_logp_difference/max": 0.7021613121032715, + "sampling/sampling_logp_difference/mean": 0.012495124712586403, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 139.953125, + "completions/mean_terminated_length": 139.953125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2683044672012329, + "epoch": 2.4019607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09810569386867438, + "kl": 0.09765708446502686, + "learning_rate": 1.1789514937959965e-07, + "loss": 0.001, + "num_tokens": 61897340.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6773133277893066, + "sampling/importance_sampling_ratio/mean": 0.9999996423721313, + "sampling/importance_sampling_ratio/min": 0.5363419651985168, + "sampling/sampling_logp_difference/max": 0.6229833364486694, + "sampling/sampling_logp_difference/mean": 0.016239367425441742, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 196.03125, + "completions/mean_terminated_length": 196.03125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.34146174788475037, + "epoch": 2.403186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1141920873923354, + "kl": 0.14285296201705933, + "learning_rate": 1.1743607561979013e-07, + "loss": 0.0053, + "num_tokens": 61929310.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4234881401062012, + "sampling/importance_sampling_ratio/mean": 0.9995550513267517, + "sampling/importance_sampling_ratio/min": 0.5124226212501526, + "sampling/sampling_logp_difference/max": 0.6686055660247803, + "sampling/sampling_logp_difference/mean": 0.017412405461072922, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 148.96875, + "completions/mean_terminated_length": 148.96875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.21372148394584656, + "epoch": 2.4044117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0202041764371432, + "kl": 0.10127365589141846, + "learning_rate": 1.1697777844051104e-07, + "loss": 0.0277, + "num_tokens": 61955708.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5071592330932617, + "sampling/importance_sampling_ratio/mean": 1.000234842300415, + "sampling/importance_sampling_ratio/min": 0.47133302688598633, + "sampling/sampling_logp_difference/max": 0.752190351486206, + "sampling/sampling_logp_difference/mean": 0.01135617308318615, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 176.421875, + "completions/mean_terminated_length": 176.421875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.1962416172027588, + "epoch": 2.405637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03121125047462608, + "kl": 0.04818423464894295, + "learning_rate": 1.1652025877207644e-07, + "loss": 0.0005, + "num_tokens": 61983719.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6769741773605347, + "sampling/importance_sampling_ratio/mean": 1.00025475025177, + "sampling/importance_sampling_ratio/min": 0.5177282094955444, + "sampling/sampling_logp_difference/max": 0.6583049297332764, + "sampling/sampling_logp_difference/mean": 0.012465772219002247, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 153.203125, + "completions/mean_terminated_length": 153.203125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.23137956857681274, + "epoch": 2.406862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0740601484478167, + "kl": 0.08045706152915955, + "learning_rate": 1.1606351754322247e-07, + "loss": 0.0009, + "num_tokens": 62008612.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8397560119628906, + "sampling/importance_sampling_ratio/mean": 1.000663161277771, + "sampling/importance_sampling_ratio/min": 0.5463835597038269, + "sampling/sampling_logp_difference/max": 0.6096329689025879, + "sampling/sampling_logp_difference/mean": 0.014718655496835709, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 171.984375, + "completions/mean_terminated_length": 171.984375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3546157479286194, + "epoch": 2.4080882352941178, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1880953904965823, + "kl": 0.11963890492916107, + "learning_rate": 1.156075556811048e-07, + "loss": -0.031, + "num_tokens": 62039075.0, + "reward": -0.125, + "reward_std": 0.5738953948020935, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4352010488510132, + "sampling/importance_sampling_ratio/mean": 0.999981164932251, + "sampling/importance_sampling_ratio/min": 0.5505171418190002, + "sampling/sampling_logp_difference/max": 0.5968972444534302, + "sampling/sampling_logp_difference/mean": 0.017265722155570984, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 151.484375, + "completions/mean_terminated_length": 151.484375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.22168831527233124, + "epoch": 2.409313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6869881388829138, + "kl": 0.10283079743385315, + "learning_rate": 1.1515237411129697e-07, + "loss": 0.0652, + "num_tokens": 62069026.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999632716178894, + "sampling/importance_sampling_ratio/min": 0.5867922902107239, + "sampling/sampling_logp_difference/max": 0.7418670654296875, + "sampling/sampling_logp_difference/mean": 0.012990560382604599, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 163.6875, + "completions/mean_terminated_length": 163.6875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.21114423871040344, + "epoch": 2.4105392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.65379610952179, + "kl": 0.12189424782991409, + "learning_rate": 1.1469797375778901e-07, + "loss": 0.0623, + "num_tokens": 62092302.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6554591655731201, + "sampling/importance_sampling_ratio/mean": 1.0000386238098145, + "sampling/importance_sampling_ratio/min": 0.6058178544044495, + "sampling/sampling_logp_difference/max": 0.5040783882141113, + "sampling/sampling_logp_difference/mean": 0.012203224934637547, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 208.828125, + "completions/mean_terminated_length": 208.828125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.2707853615283966, + "epoch": 2.411764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4753170204784647, + "kl": 0.07635916769504547, + "learning_rate": 1.1424435554298473e-07, + "loss": -0.0148, + "num_tokens": 62127123.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7422895431518555, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 0.5663594603538513, + "sampling/sampling_logp_difference/max": 0.5685262680053711, + "sampling/sampling_logp_difference/mean": 0.014813482761383057, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 206.140625, + "completions/mean_terminated_length": 206.140625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.30891549587249756, + "epoch": 2.4129901960784315, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1459926530914273, + "kl": 0.0987582579255104, + "learning_rate": 1.1379152038770029e-07, + "loss": 0.0225, + "num_tokens": 62161564.0, + "reward": 0.6875, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4569240808486938, + "sampling/importance_sampling_ratio/mean": 0.9999404549598694, + "sampling/importance_sampling_ratio/min": 0.49131664633750916, + "sampling/sampling_logp_difference/max": 0.7106664180755615, + "sampling/sampling_logp_difference/mean": 0.016402438282966614, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 197.53125, + "completions/mean_terminated_length": 197.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2572060227394104, + "epoch": 2.4142156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.257642491145384, + "kl": 0.1277352273464203, + "learning_rate": 1.1333946921116234e-07, + "loss": -0.019, + "num_tokens": 62190302.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5978838205337524, + "sampling/importance_sampling_ratio/mean": 1.0002050399780273, + "sampling/importance_sampling_ratio/min": 0.6207980513572693, + "sampling/sampling_logp_difference/max": 0.4767494201660156, + "sampling/sampling_logp_difference/mean": 0.0142198596149683, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 200.046875, + "completions/mean_terminated_length": 200.046875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.30708110332489014, + "epoch": 2.4154411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6849725653324623, + "kl": 0.07924254238605499, + "learning_rate": 1.1288820293100637e-07, + "loss": -0.021, + "num_tokens": 62222513.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991940259933472, + "sampling/importance_sampling_ratio/min": 0.38363999128341675, + "sampling/sampling_logp_difference/max": 1.9116621017456055, + "sampling/sampling_logp_difference/mean": 0.016897523775696754, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 169.421875, + "completions/mean_terminated_length": 169.421875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2777869701385498, + "epoch": 2.4166666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.049756210521659, + "kl": 0.09561365842819214, + "learning_rate": 1.1243772246327415e-07, + "loss": -0.0874, + "num_tokens": 62252716.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003769397735596, + "sampling/importance_sampling_ratio/min": 0.5252593159675598, + "sampling/sampling_logp_difference/max": 0.7423210144042969, + "sampling/sampling_logp_difference/mean": 0.01625244691967964, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 255.96875, + "completions/mean_terminated_length": 255.96875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.32085007429122925, + "epoch": 2.417892156862745, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8697659940414395, + "kl": 0.09609529376029968, + "learning_rate": 1.1198802872241242e-07, + "loss": -0.0031, + "num_tokens": 62288458.0, + "reward": 0.5, + "reward_std": 0.6663130521774292, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5612561702728271, + "sampling/importance_sampling_ratio/mean": 0.9996238946914673, + "sampling/importance_sampling_ratio/min": 0.3742508292198181, + "sampling/sampling_logp_difference/max": 0.9828290939331055, + "sampling/sampling_logp_difference/mean": 0.01732058823108673, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 145.78125, + "completions/mean_terminated_length": 145.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.19306042790412903, + "epoch": 2.4191176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07594578510975517, + "kl": 0.09978260099887848, + "learning_rate": 1.1153912262127119e-07, + "loss": 0.0011, + "num_tokens": 62317868.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.910028338432312, + "sampling/importance_sampling_ratio/mean": 1.0001709461212158, + "sampling/importance_sampling_ratio/min": 0.5480507016181946, + "sampling/sampling_logp_difference/max": 0.647118091583252, + "sampling/sampling_logp_difference/mean": 0.014451880939304829, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 198.6875, + "completions/mean_terminated_length": 198.6875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.20174993574619293, + "epoch": 2.420343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08789469737367238, + "kl": 0.09293520450592041, + "learning_rate": 1.1109100507110131e-07, + "loss": 0.0008, + "num_tokens": 62344376.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071581602096558, + "sampling/importance_sampling_ratio/mean": 1.0001535415649414, + "sampling/importance_sampling_ratio/min": 0.47808966040611267, + "sampling/sampling_logp_difference/max": 0.7379570007324219, + "sampling/sampling_logp_difference/mean": 0.011941138654947281, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 169.828125, + "completions/mean_terminated_length": 169.828125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3261312246322632, + "epoch": 2.4215686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3812259383358159, + "kl": 0.08990463614463806, + "learning_rate": 1.1064367698155303e-07, + "loss": 0.0094, + "num_tokens": 62377165.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6962614059448242, + "sampling/importance_sampling_ratio/mean": 0.9996572136878967, + "sampling/importance_sampling_ratio/min": 0.6097140312194824, + "sampling/sampling_logp_difference/max": 0.5284266471862793, + "sampling/sampling_logp_difference/mean": 0.016119036823511124, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 165.265625, + "completions/mean_terminated_length": 165.265625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.22535020112991333, + "epoch": 2.422794117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.017966091119861, + "kl": 0.09670273959636688, + "learning_rate": 1.1019713926067392e-07, + "loss": 0.0178, + "num_tokens": 62405614.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000312328338623, + "sampling/importance_sampling_ratio/min": 0.44563791155815125, + "sampling/sampling_logp_difference/max": 1.1860003471374512, + "sampling/sampling_logp_difference/mean": 0.014435198158025742, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 277.328125, + "completions/mean_terminated_length": 277.328125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.24867111444473267, + "epoch": 2.424019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2647538538960692, + "kl": 0.10145062953233719, + "learning_rate": 1.0975139281490747e-07, + "loss": -0.0364, + "num_tokens": 62442195.0, + "reward": 0.78125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.7393453121185303, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 0.5520194172859192, + "sampling/sampling_logp_difference/max": 0.5941720008850098, + "sampling/sampling_logp_difference/mean": 0.013782719150185585, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 172.8125, + "completions/mean_terminated_length": 172.8125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.24885821342468262, + "epoch": 2.4252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07125148858301587, + "kl": 0.1279226541519165, + "learning_rate": 1.093064385490906e-07, + "loss": 0.0011, + "num_tokens": 62467607.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5403485298156738, + "sampling/importance_sampling_ratio/mean": 1.0000057220458984, + "sampling/importance_sampling_ratio/min": 0.6010531783103943, + "sampling/sampling_logp_difference/max": 0.5090718269348145, + "sampling/sampling_logp_difference/mean": 0.01431182585656643, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 255.421875, + "completions/mean_terminated_length": 255.421875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.22771960496902466, + "epoch": 2.426470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7144092251646503, + "kl": 0.08566172420978546, + "learning_rate": 1.0886227736645215e-07, + "loss": -0.0003, + "num_tokens": 62506674.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999967098236084, + "sampling/importance_sampling_ratio/min": 0.2704428732395172, + "sampling/sampling_logp_difference/max": 1.307694435119629, + "sampling/sampling_logp_difference/mean": 0.01397204864770174, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 202.75, + "completions/mean_terminated_length": 202.75, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3569865822792053, + "epoch": 2.4276960784313726, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6258296681879294, + "kl": 0.1403682827949524, + "learning_rate": 1.0841891016861155e-07, + "loss": 0.0063, + "num_tokens": 62540242.0, + "reward": 0.3125, + "reward_std": 0.8389039635658264, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6555317640304565, + "sampling/importance_sampling_ratio/mean": 0.9997969269752502, + "sampling/importance_sampling_ratio/min": 0.3920708894729614, + "sampling/sampling_logp_difference/max": 0.9363126754760742, + "sampling/sampling_logp_difference/mean": 0.01937026157975197, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 178.09375, + "completions/mean_terminated_length": 178.09375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.22271037101745605, + "epoch": 2.428921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.973184989448742, + "kl": 0.10955817997455597, + "learning_rate": 1.0797633785557581e-07, + "loss": 0.0177, + "num_tokens": 62573192.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6316500902175903, + "sampling/importance_sampling_ratio/mean": 1.0002367496490479, + "sampling/importance_sampling_ratio/min": 0.6065034866333008, + "sampling/sampling_logp_difference/max": 0.5000448226928711, + "sampling/sampling_logp_difference/mean": 0.013782722875475883, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 171.28125, + "completions/mean_terminated_length": 171.28125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.23561523854732513, + "epoch": 2.4301470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5343826417136237, + "kl": 0.07251683622598648, + "learning_rate": 1.0753456132573885e-07, + "loss": 0.0383, + "num_tokens": 62604570.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5926032066345215, + "sampling/importance_sampling_ratio/mean": 1.0000483989715576, + "sampling/importance_sampling_ratio/min": 0.47895604372024536, + "sampling/sampling_logp_difference/max": 0.7361464500427246, + "sampling/sampling_logp_difference/mean": 0.013745970092713833, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 195.296875, + "completions/mean_terminated_length": 195.296875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2558283507823944, + "epoch": 2.431372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17368507304050582, + "kl": 0.07469744980335236, + "learning_rate": 1.0709358147587883e-07, + "loss": 0.0008, + "num_tokens": 62636733.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6177139282226562, + "sampling/importance_sampling_ratio/mean": 1.0005903244018555, + "sampling/importance_sampling_ratio/min": 0.4985237419605255, + "sampling/sampling_logp_difference/max": 0.6961040496826172, + "sampling/sampling_logp_difference/mean": 0.015179083682596684, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 231.3125, + "completions/mean_terminated_length": 231.3125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.29034364223480225, + "epoch": 2.4325980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0525714732231246, + "kl": 0.09454121440649033, + "learning_rate": 1.0665339920115718e-07, + "loss": 0.0064, + "num_tokens": 62668737.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6174228191375732, + "sampling/importance_sampling_ratio/mean": 0.9998698830604553, + "sampling/importance_sampling_ratio/min": 0.46674081683158875, + "sampling/sampling_logp_difference/max": 0.7619811296463013, + "sampling/sampling_logp_difference/mean": 0.015458998270332813, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 176.734375, + "completions/mean_terminated_length": 176.734375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2578873038291931, + "epoch": 2.4338235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18534368544284197, + "kl": 0.10618950426578522, + "learning_rate": 1.0621401539511587e-07, + "loss": 0.001, + "num_tokens": 62702896.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.597326636314392, + "sampling/importance_sampling_ratio/mean": 0.9995043277740479, + "sampling/importance_sampling_ratio/min": 0.6241003274917603, + "sampling/sampling_logp_difference/max": 0.47144412994384766, + "sampling/sampling_logp_difference/mean": 0.015324244275689125, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 150.84375, + "completions/mean_terminated_length": 150.84375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.21928420662879944, + "epoch": 2.435049019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059385206113499005, + "kl": 0.07461683452129364, + "learning_rate": 1.0577543094967611e-07, + "loss": 0.0007, + "num_tokens": 62730070.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6468877792358398, + "sampling/importance_sampling_ratio/mean": 0.9989715814590454, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.498887300491333, + "sampling/sampling_logp_difference/mean": 0.014201447367668152, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 219.4375, + "completions/mean_terminated_length": 219.4375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2593943476676941, + "epoch": 2.436274509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04040715339197583, + "kl": 0.059937912970781326, + "learning_rate": 1.053376467551368e-07, + "loss": 0.0006, + "num_tokens": 62761218.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5471819639205933, + "sampling/importance_sampling_ratio/mean": 1.000226616859436, + "sampling/importance_sampling_ratio/min": 0.627091646194458, + "sampling/sampling_logp_difference/max": 0.4666626453399658, + "sampling/sampling_logp_difference/mean": 0.014485637657344341, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 178.234375, + "completions/mean_terminated_length": 178.234375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.20651875436306, + "epoch": 2.4375, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1828434759628255, + "kl": 0.08197721838951111, + "learning_rate": 1.0490066370017181e-07, + "loss": 0.019, + "num_tokens": 62789505.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.450416088104248, + "sampling/importance_sampling_ratio/mean": 1.0003294944763184, + "sampling/importance_sampling_ratio/min": 0.49541762471199036, + "sampling/sampling_logp_difference/max": 0.7023541927337646, + "sampling/sampling_logp_difference/mean": 0.012643104419112206, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 218.671875, + "completions/mean_terminated_length": 218.671875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.17756140232086182, + "epoch": 2.438725490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051933208386524814, + "kl": 0.06588190793991089, + "learning_rate": 1.044644826718295e-07, + "loss": 0.0006, + "num_tokens": 62826188.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997557997703552, + "sampling/importance_sampling_ratio/min": 0.4160601496696472, + "sampling/sampling_logp_difference/max": 0.8769254684448242, + "sampling/sampling_logp_difference/mean": 0.011307923123240471, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 190.59375, + "completions/mean_terminated_length": 190.59375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.24801194667816162, + "epoch": 2.439950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2933570005385122, + "kl": 0.08753550052642822, + "learning_rate": 1.0402910455552916e-07, + "loss": -0.0104, + "num_tokens": 62856754.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006647109985352, + "sampling/importance_sampling_ratio/min": 0.5039380192756653, + "sampling/sampling_logp_difference/max": 0.7099902629852295, + "sampling/sampling_logp_difference/mean": 0.01531485840678215, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 193.03125, + "completions/mean_terminated_length": 193.03125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.26168012619018555, + "epoch": 2.4411764705882355, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7832935941676689, + "kl": 0.11002017557621002, + "learning_rate": 1.0359453023506121e-07, + "loss": -0.0068, + "num_tokens": 62884852.0, + "reward": 0.5625, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5547469854354858, + "sampling/importance_sampling_ratio/mean": 1.0002416372299194, + "sampling/importance_sampling_ratio/min": 0.6142948269844055, + "sampling/sampling_logp_difference/max": 0.48728036880493164, + "sampling/sampling_logp_difference/mean": 0.014116182923316956, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 193.71875, + "completions/mean_terminated_length": 193.71875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.26052984595298767, + "epoch": 2.4424019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8917314980948466, + "kl": 0.08373277634382248, + "learning_rate": 1.0316076059258389e-07, + "loss": 0.0001, + "num_tokens": 62914754.0, + "reward": -0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5302009582519531, + "sampling/importance_sampling_ratio/mean": 1.000388503074646, + "sampling/importance_sampling_ratio/min": 0.5006714463233948, + "sampling/sampling_logp_difference/max": 0.6918051242828369, + "sampling/sampling_logp_difference/mean": 0.014232850633561611, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 174.609375, + "completions/mean_terminated_length": 174.609375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.1979416310787201, + "epoch": 2.443627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3102813169215466, + "kl": 0.06639556586742401, + "learning_rate": 1.0272779650862185e-07, + "loss": -0.0236, + "num_tokens": 62945577.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.6545871496200562, + "sampling/importance_sampling_ratio/mean": 1.0002236366271973, + "sampling/importance_sampling_ratio/min": 0.6114649772644043, + "sampling/sampling_logp_difference/max": 0.5035514831542969, + "sampling/sampling_logp_difference/mean": 0.012921427376568317, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 192.96875, + "completions/mean_terminated_length": 192.96875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3212936520576477, + "epoch": 2.4448529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2865328555504383, + "kl": 0.1091475859284401, + "learning_rate": 1.0229563886206516e-07, + "loss": 0.0219, + "num_tokens": 62977271.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6545579433441162, + "sampling/importance_sampling_ratio/mean": 0.9995372295379639, + "sampling/importance_sampling_ratio/min": 0.5046824216842651, + "sampling/sampling_logp_difference/max": 0.6838259696960449, + "sampling/sampling_logp_difference/mean": 0.01750960759818554, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 145.265625, + "completions/mean_terminated_length": 145.265625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.20676608383655548, + "epoch": 2.446078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05716501569543919, + "kl": 0.06579281389713287, + "learning_rate": 1.0186428853016604e-07, + "loss": 0.0007, + "num_tokens": 63007944.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9099797010421753, + "sampling/importance_sampling_ratio/mean": 1.000302791595459, + "sampling/importance_sampling_ratio/min": 0.5734854936599731, + "sampling/sampling_logp_difference/max": 0.6470925807952881, + "sampling/sampling_logp_difference/mean": 0.013321581296622753, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 182.671875, + "completions/mean_terminated_length": 182.671875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2063009887933731, + "epoch": 2.4473039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05980800829816122, + "kl": 0.09165454655885696, + "learning_rate": 1.0143374638853891e-07, + "loss": 0.0009, + "num_tokens": 63035635.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.575442910194397, + "sampling/importance_sampling_ratio/mean": 0.9995609521865845, + "sampling/importance_sampling_ratio/min": 0.432391881942749, + "sampling/sampling_logp_difference/max": 0.8384230136871338, + "sampling/sampling_logp_difference/mean": 0.014535813592374325, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 212.34375, + "completions/mean_terminated_length": 212.34375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2586808204650879, + "epoch": 2.448529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.027373811906634, + "kl": 0.0688728392124176, + "learning_rate": 1.0100401331115638e-07, + "loss": 0.0055, + "num_tokens": 63067177.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5745112895965576, + "sampling/importance_sampling_ratio/mean": 0.9992469549179077, + "sampling/importance_sampling_ratio/min": 0.6117782592773438, + "sampling/sampling_logp_difference/max": 0.49138545989990234, + "sampling/sampling_logp_difference/mean": 0.013521851971745491, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 212.3125, + "completions/mean_terminated_length": 212.3125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.25559788942337036, + "epoch": 2.4497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04906564084842952, + "kl": 0.0891442596912384, + "learning_rate": 1.0057509017034977e-07, + "loss": 0.0008, + "num_tokens": 63098813.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6321675777435303, + "sampling/importance_sampling_ratio/mean": 1.0008788108825684, + "sampling/importance_sampling_ratio/min": 0.6126702427864075, + "sampling/sampling_logp_difference/max": 0.4899284839630127, + "sampling/sampling_logp_difference/mean": 0.015120787546038628, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 164.484375, + "completions/mean_terminated_length": 164.484375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.20930957794189453, + "epoch": 2.450980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06804692246110695, + "kl": 0.07293498516082764, + "learning_rate": 1.001469778368057e-07, + "loss": 0.0007, + "num_tokens": 63124828.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.696546196937561, + "sampling/importance_sampling_ratio/mean": 0.9996410608291626, + "sampling/importance_sampling_ratio/min": 0.43092525005340576, + "sampling/sampling_logp_difference/max": 0.8418207168579102, + "sampling/sampling_logp_difference/mean": 0.013784918002784252, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 155.09375, + "completions/mean_terminated_length": 155.09375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.27410781383514404, + "epoch": 2.452205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4272336769540415, + "kl": 0.09052129089832306, + "learning_rate": 9.971967717956531e-08, + "loss": 0.0204, + "num_tokens": 63162322.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6674655675888062, + "sampling/importance_sampling_ratio/mean": 0.9999108910560608, + "sampling/importance_sampling_ratio/min": 0.3962820768356323, + "sampling/sampling_logp_difference/max": 0.9256290197372437, + "sampling/sampling_logp_difference/mean": 0.01571957767009735, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 168.390625, + "completions/mean_terminated_length": 168.390625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2763534486293793, + "epoch": 2.4534313725490198, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7039991687458058, + "kl": 0.10870914906263351, + "learning_rate": 9.929318906602174e-08, + "loss": 0.0185, + "num_tokens": 63189323.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5515753030776978, + "sampling/importance_sampling_ratio/mean": 0.9997731447219849, + "sampling/importance_sampling_ratio/min": 0.44878554344177246, + "sampling/sampling_logp_difference/max": 0.8012101054191589, + "sampling/sampling_logp_difference/mean": 0.016712991520762444, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 234.671875, + "completions/mean_terminated_length": 234.671875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.27719780802726746, + "epoch": 2.454656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.877160944103908, + "kl": 0.08174227178096771, + "learning_rate": 9.886751436191871e-08, + "loss": -0.0024, + "num_tokens": 63224134.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6231054067611694, + "sampling/importance_sampling_ratio/mean": 0.9999527931213379, + "sampling/importance_sampling_ratio/min": 0.5397012829780579, + "sampling/sampling_logp_difference/max": 0.6167394518852234, + "sampling/sampling_logp_difference/mean": 0.014655927196145058, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 205.109375, + "completions/mean_terminated_length": 205.109375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2259541153907776, + "epoch": 2.4558823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6899458495436732, + "kl": 0.09486418217420578, + "learning_rate": 9.844265393134926e-08, + "loss": 0.0107, + "num_tokens": 63256829.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002696514129639, + "sampling/importance_sampling_ratio/min": 0.058544568717479706, + "sampling/sampling_logp_difference/max": 2.8379669189453125, + "sampling/sampling_logp_difference/mean": 0.015723761171102524, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 157.25, + "completions/mean_terminated_length": 157.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.27056199312210083, + "epoch": 2.457107843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08672783765579291, + "kl": 0.13522277772426605, + "learning_rate": 9.801860863675266e-08, + "loss": 0.0014, + "num_tokens": 63286797.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.831900954246521, + "sampling/importance_sampling_ratio/mean": 1.0001976490020752, + "sampling/importance_sampling_ratio/min": 0.3242254853248596, + "sampling/sampling_logp_difference/max": 1.1263160705566406, + "sampling/sampling_logp_difference/mean": 0.01663844659924507, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 212.90625, + "completions/mean_terminated_length": 212.90625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.23600170016288757, + "epoch": 2.4583333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06562357598801245, + "kl": 0.09271427989006042, + "learning_rate": 9.759537933891421e-08, + "loss": 0.0009, + "num_tokens": 63316615.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7565416097640991, + "sampling/importance_sampling_ratio/mean": 1.0001403093338013, + "sampling/importance_sampling_ratio/min": 0.5815595984458923, + "sampling/sampling_logp_difference/max": 0.5633468627929688, + "sampling/sampling_logp_difference/mean": 0.014578972943127155, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 174.84375, + "completions/mean_terminated_length": 174.84375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2847607433795929, + "epoch": 2.4595588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0359462268745783, + "kl": 0.16529709100723267, + "learning_rate": 9.71729668969628e-08, + "loss": -0.026, + "num_tokens": 63345773.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5527472496032715, + "sampling/importance_sampling_ratio/mean": 0.9999092221260071, + "sampling/importance_sampling_ratio/min": 0.5738064050674438, + "sampling/sampling_logp_difference/max": 0.5554631948471069, + "sampling/sampling_logp_difference/mean": 0.01704687811434269, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 162.265625, + "completions/mean_terminated_length": 162.265625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2768649160861969, + "epoch": 2.4607843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2699946394772597, + "kl": 0.112728051841259, + "learning_rate": 9.67513721683687e-08, + "loss": -0.0048, + "num_tokens": 63371598.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6522746086120605, + "sampling/importance_sampling_ratio/mean": 0.9994333386421204, + "sampling/importance_sampling_ratio/min": 0.6203518509864807, + "sampling/sampling_logp_difference/max": 0.5021529197692871, + "sampling/sampling_logp_difference/mean": 0.015261069871485233, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 210.484375, + "completions/mean_terminated_length": 210.484375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2198856770992279, + "epoch": 2.4620098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2203288813385462, + "kl": 0.06088901311159134, + "learning_rate": 9.633059600894256e-08, + "loss": -0.077, + "num_tokens": 63410125.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.602226972579956, + "sampling/importance_sampling_ratio/mean": 0.9999580383300781, + "sampling/importance_sampling_ratio/min": 0.485906183719635, + "sampling/sampling_logp_difference/max": 0.7217397689819336, + "sampling/sampling_logp_difference/mean": 0.013644246384501457, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 173.609375, + "completions/mean_terminated_length": 173.609375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.24299579858779907, + "epoch": 2.463235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5943768688521742, + "kl": 0.06873179972171783, + "learning_rate": 9.59106392728331e-08, + "loss": -0.0071, + "num_tokens": 63441172.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000420331954956, + "sampling/importance_sampling_ratio/min": 0.6032364964485168, + "sampling/sampling_logp_difference/max": 0.708549976348877, + "sampling/sampling_logp_difference/mean": 0.014463325031101704, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 198.546875, + "completions/mean_terminated_length": 198.546875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.23096302151679993, + "epoch": 2.4644607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3059021548108523, + "kl": 0.07095976173877716, + "learning_rate": 9.549150281252632e-08, + "loss": 0.0013, + "num_tokens": 63476039.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005818605422974, + "sampling/importance_sampling_ratio/min": 0.2528098523616791, + "sampling/sampling_logp_difference/max": 1.3751176595687866, + "sampling/sampling_logp_difference/mean": 0.01635436713695526, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 207.671875, + "completions/mean_terminated_length": 207.671875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.27460724115371704, + "epoch": 2.465686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4587355614766577, + "kl": 0.08916577696800232, + "learning_rate": 9.507318747884241e-08, + "loss": -0.0033, + "num_tokens": 63509426.0, + "reward": 0.375, + "reward_std": 0.481805682182312, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000704526901245, + "sampling/importance_sampling_ratio/min": 0.3304603397846222, + "sampling/sampling_logp_difference/max": 1.1072685718536377, + "sampling/sampling_logp_difference/mean": 0.013697458431124687, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 172.171875, + "completions/mean_terminated_length": 172.171875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2268461287021637, + "epoch": 2.4669117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9820631840838865, + "kl": 0.1107923835515976, + "learning_rate": 9.465569412093488e-08, + "loss": -0.0085, + "num_tokens": 63536013.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001955032348633, + "sampling/importance_sampling_ratio/min": 0.5419043898582458, + "sampling/sampling_logp_difference/max": 1.5411171913146973, + "sampling/sampling_logp_difference/mean": 0.013624858111143112, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 232.78125, + "completions/mean_terminated_length": 232.78125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2935520112514496, + "epoch": 2.468137254901961, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8649602721943874, + "kl": 0.1160188540816307, + "learning_rate": 9.423902358628916e-08, + "loss": 0.0359, + "num_tokens": 63575823.0, + "reward": 0.5, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994773864746094, + "sampling/importance_sampling_ratio/min": 0.5942912697792053, + "sampling/sampling_logp_difference/max": 0.7240054607391357, + "sampling/sampling_logp_difference/mean": 0.014655300416052341, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 222.375, + "completions/mean_terminated_length": 222.375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.22237110137939453, + "epoch": 2.469362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.149968194477422, + "kl": 0.06632451713085175, + "learning_rate": 9.382317672071966e-08, + "loss": -0.0339, + "num_tokens": 63603543.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6439599990844727, + "sampling/importance_sampling_ratio/mean": 0.9999493360519409, + "sampling/importance_sampling_ratio/min": 0.5227307677268982, + "sampling/sampling_logp_difference/max": 0.648688793182373, + "sampling/sampling_logp_difference/mean": 0.014232446439564228, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 225.3125, + "completions/mean_terminated_length": 225.3125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.23953120410442352, + "epoch": 2.4705882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1410351693893919, + "kl": 0.09374180436134338, + "learning_rate": 9.340815436836963e-08, + "loss": 0.0009, + "num_tokens": 63636507.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9712061882019043, + "sampling/importance_sampling_ratio/mean": 1.0000889301300049, + "sampling/importance_sampling_ratio/min": 0.5806989669799805, + "sampling/sampling_logp_difference/max": 0.6786456108093262, + "sampling/sampling_logp_difference/mean": 0.015163847245275974, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 251.96875, + "completions/mean_terminated_length": 251.96875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.21348419785499573, + "epoch": 2.471813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.991835265185055, + "kl": 0.06538738310337067, + "learning_rate": 9.299395737170757e-08, + "loss": -0.034, + "num_tokens": 63669849.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.8819410800933838, + "sampling/importance_sampling_ratio/mean": 0.999263346195221, + "sampling/importance_sampling_ratio/min": 0.08145135641098022, + "sampling/sampling_logp_difference/max": 2.507749319076538, + "sampling/sampling_logp_difference/mean": 0.012916372157633305, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 168.96875, + "completions/mean_terminated_length": 168.96875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.19465279579162598, + "epoch": 2.4730392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05307041611124714, + "kl": 0.08682794123888016, + "learning_rate": 9.258058657152761e-08, + "loss": 0.0008, + "num_tokens": 63697975.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000096082687378, + "sampling/importance_sampling_ratio/min": 0.6017568111419678, + "sampling/sampling_logp_difference/max": 0.841266393661499, + "sampling/sampling_logp_difference/mean": 0.012608527205884457, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 187.125, + "completions/mean_terminated_length": 187.125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.3037737011909485, + "epoch": 2.474264705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0234115255557776, + "kl": 0.18698279559612274, + "learning_rate": 9.216804280694612e-08, + "loss": -0.0039, + "num_tokens": 63726383.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9514977931976318, + "sampling/importance_sampling_ratio/mean": 0.999690055847168, + "sampling/importance_sampling_ratio/min": 0.5022098422050476, + "sampling/sampling_logp_difference/max": 0.6887372732162476, + "sampling/sampling_logp_difference/mean": 0.015550685115158558, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 206.421875, + "completions/mean_terminated_length": 206.421875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.23022770881652832, + "epoch": 2.4754901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2283473709930766, + "kl": 0.07669369131326675, + "learning_rate": 9.175632691540064e-08, + "loss": 0.0925, + "num_tokens": 63760714.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.64573335647583, + "sampling/importance_sampling_ratio/mean": 0.9996827840805054, + "sampling/importance_sampling_ratio/min": 0.5268608331680298, + "sampling/sampling_logp_difference/max": 0.6408188343048096, + "sampling/sampling_logp_difference/mean": 0.014048980548977852, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 133.34375, + "completions/mean_terminated_length": 133.34375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.19566136598587036, + "epoch": 2.4767156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1620033596651016, + "kl": 0.07578049600124359, + "learning_rate": 9.134543973264868e-08, + "loss": 0.0124, + "num_tokens": 63780688.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5518295764923096, + "sampling/importance_sampling_ratio/mean": 0.9996536374092102, + "sampling/importance_sampling_ratio/min": 0.5661754608154297, + "sampling/sampling_logp_difference/max": 0.5688512325286865, + "sampling/sampling_logp_difference/mean": 0.013932591304183006, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 212.421875, + "completions/mean_terminated_length": 212.421875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2625789940357208, + "epoch": 2.4779411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3469507053829686, + "kl": 0.08934664726257324, + "learning_rate": 9.093538209276486e-08, + "loss": -0.0173, + "num_tokens": 63810011.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.8731153011322021, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.5131204724311829, + "sampling/sampling_logp_difference/max": 0.6672446727752686, + "sampling/sampling_logp_difference/mean": 0.014829147607088089, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 217.03125, + "completions/mean_terminated_length": 217.03125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2258068323135376, + "epoch": 2.4791666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04342160452499901, + "kl": 0.08009222894906998, + "learning_rate": 9.052615482814069e-08, + "loss": 0.0007, + "num_tokens": 63846445.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7562291622161865, + "sampling/importance_sampling_ratio/mean": 0.9999803900718689, + "sampling/importance_sampling_ratio/min": 0.48317864537239075, + "sampling/sampling_logp_difference/max": 0.7273688316345215, + "sampling/sampling_logp_difference/mean": 0.012380572967231274, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 232.25, + "completions/mean_terminated_length": 232.25, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.29769203066825867, + "epoch": 2.480392156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.532320422530228, + "kl": 0.12566164135932922, + "learning_rate": 9.011775876948096e-08, + "loss": 0.0733, + "num_tokens": 63876381.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967217445374, + "sampling/importance_sampling_ratio/min": 0.5362070202827454, + "sampling/sampling_logp_difference/max": 0.710334300994873, + "sampling/sampling_logp_difference/mean": 0.01466565765440464, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 180.265625, + "completions/mean_terminated_length": 180.265625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.267097145318985, + "epoch": 2.4816176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.920288812577845, + "kl": 0.12502649426460266, + "learning_rate": 8.971019474580427e-08, + "loss": 0.0031, + "num_tokens": 63901406.0, + "reward": 0.0625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5178194046020508, + "sampling/importance_sampling_ratio/mean": 1.0002391338348389, + "sampling/importance_sampling_ratio/min": 0.6014291048049927, + "sampling/sampling_logp_difference/max": 0.5084466934204102, + "sampling/sampling_logp_difference/mean": 0.015463524498045444, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 179.734375, + "completions/mean_terminated_length": 179.734375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.2664422392845154, + "epoch": 2.482843137254902, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5975967423978235, + "kl": 0.10327211022377014, + "learning_rate": 8.930346358443953e-08, + "loss": 0.0818, + "num_tokens": 63927581.0, + "reward": -0.03125, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4618009328842163, + "sampling/importance_sampling_ratio/mean": 0.9999598264694214, + "sampling/importance_sampling_ratio/min": 0.4522865414619446, + "sampling/sampling_logp_difference/max": 0.7934393882751465, + "sampling/sampling_logp_difference/mean": 0.01469662319868803, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 163.984375, + "completions/mean_terminated_length": 163.984375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.2569267749786377, + "epoch": 2.4840686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08712411280874989, + "kl": 0.10711999982595444, + "learning_rate": 8.889756611102539e-08, + "loss": 0.0011, + "num_tokens": 63952604.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.630460500717163, + "sampling/importance_sampling_ratio/mean": 0.9999353289604187, + "sampling/importance_sampling_ratio/min": 0.46057137846946716, + "sampling/sampling_logp_difference/max": 0.775287389755249, + "sampling/sampling_logp_difference/mean": 0.016630226746201515, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 202.3125, + "completions/mean_terminated_length": 202.3125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2259894609451294, + "epoch": 2.485294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.228854813862446, + "kl": 0.05850193649530411, + "learning_rate": 8.84925031495079e-08, + "loss": -0.0875, + "num_tokens": 63982752.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6272915601730347, + "sampling/importance_sampling_ratio/mean": 1.0000848770141602, + "sampling/importance_sampling_ratio/min": 0.47421547770500183, + "sampling/sampling_logp_difference/max": 0.7460935115814209, + "sampling/sampling_logp_difference/mean": 0.01230490393936634, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 174.609375, + "completions/mean_terminated_length": 174.609375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.2859165668487549, + "epoch": 2.486519607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0031900250778567, + "kl": 0.11168570816516876, + "learning_rate": 8.808827552213916e-08, + "loss": -0.0181, + "num_tokens": 64008311.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999197125434875, + "sampling/importance_sampling_ratio/min": 0.5540803074836731, + "sampling/sampling_logp_difference/max": 0.6932724714279175, + "sampling/sampling_logp_difference/mean": 0.01680031418800354, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 141.59375, + "completions/mean_terminated_length": 141.59375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.246006578207016, + "epoch": 2.4877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05821484953334174, + "kl": 0.08178937435150146, + "learning_rate": 8.768488404947593e-08, + "loss": 0.0008, + "num_tokens": 64033757.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.640612244606018, + "sampling/importance_sampling_ratio/mean": 0.9998109340667725, + "sampling/importance_sampling_ratio/min": 0.5501823425292969, + "sampling/sampling_logp_difference/max": 0.5975055694580078, + "sampling/sampling_logp_difference/mean": 0.015690255910158157, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 196.015625, + "completions/mean_terminated_length": 196.015625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.24520564079284668, + "epoch": 2.488970588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042374916409275605, + "kl": 0.06653556227684021, + "learning_rate": 8.728232955037696e-08, + "loss": 0.0006, + "num_tokens": 64063710.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7712163925170898, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.5895556807518005, + "sampling/sampling_logp_difference/max": 0.5716664791107178, + "sampling/sampling_logp_difference/mean": 0.01448611356317997, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 188.828125, + "completions/mean_terminated_length": 188.828125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.16599786281585693, + "epoch": 2.4901960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07786440309286723, + "kl": 0.05656210705637932, + "learning_rate": 8.688061284200265e-08, + "loss": 0.0006, + "num_tokens": 64094675.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007353067398071, + "sampling/importance_sampling_ratio/mean": 1.0000226497650146, + "sampling/importance_sampling_ratio/min": 0.319061279296875, + "sampling/sampling_logp_difference/max": 1.1423721313476562, + "sampling/sampling_logp_difference/mean": 0.010407873429358006, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 222.3125, + "completions/mean_terminated_length": 222.3125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2846994698047638, + "epoch": 2.491421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8236398880872422, + "kl": 0.16061674058437347, + "learning_rate": 8.647973473981224e-08, + "loss": -0.0055, + "num_tokens": 64128551.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5744246244430542, + "sampling/importance_sampling_ratio/mean": 0.9995661973953247, + "sampling/importance_sampling_ratio/min": 0.4824768006801605, + "sampling/sampling_logp_difference/max": 0.7288224697113037, + "sampling/sampling_logp_difference/mean": 0.015534237027168274, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 172.09375, + "completions/mean_terminated_length": 172.09375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.21926692128181458, + "epoch": 2.4926470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04258034793347578, + "kl": 0.05456060916185379, + "learning_rate": 8.607969605756315e-08, + "loss": 0.0006, + "num_tokens": 64157933.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999468207359314, + "sampling/importance_sampling_ratio/min": 0.5684303045272827, + "sampling/sampling_logp_difference/max": 0.986548900604248, + "sampling/sampling_logp_difference/mean": 0.013844440691173077, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 225.171875, + "completions/mean_terminated_length": 225.171875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2624910771846771, + "epoch": 2.493872549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7775404778616285, + "kl": 0.07403460144996643, + "learning_rate": 8.568049760730838e-08, + "loss": 0.0044, + "num_tokens": 64193496.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6019309759140015, + "sampling/importance_sampling_ratio/mean": 1.0007879734039307, + "sampling/importance_sampling_ratio/min": 0.552937924861908, + "sampling/sampling_logp_difference/max": 0.5925095081329346, + "sampling/sampling_logp_difference/mean": 0.014647210016846657, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 176.640625, + "completions/mean_terminated_length": 176.640625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3022158443927765, + "epoch": 2.4950980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.992703879495102, + "kl": 0.10479722917079926, + "learning_rate": 8.52821401993955e-08, + "loss": -0.0042, + "num_tokens": 64223681.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0010337829589844, + "sampling/importance_sampling_ratio/min": 0.6119037866592407, + "sampling/sampling_logp_difference/max": 0.8013436794281006, + "sampling/sampling_logp_difference/mean": 0.016235051676630974, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 156.0, + "completions/mean_terminated_length": 156.0, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.20266065001487732, + "epoch": 2.4963235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04784982307143272, + "kl": 0.05678699165582657, + "learning_rate": 8.488462464246493e-08, + "loss": 0.0006, + "num_tokens": 64253953.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.720233678817749, + "sampling/importance_sampling_ratio/mean": 0.9993163347244263, + "sampling/importance_sampling_ratio/min": 0.6155776381492615, + "sampling/sampling_logp_difference/max": 0.5424602031707764, + "sampling/sampling_logp_difference/mean": 0.01326032169163227, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 152.203125, + "completions/mean_terminated_length": 152.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.18145807087421417, + "epoch": 2.497549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4403634850359908, + "kl": 0.07998235523700714, + "learning_rate": 8.448795174344803e-08, + "loss": -0.0118, + "num_tokens": 64281198.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.651445984840393, + "sampling/importance_sampling_ratio/mean": 1.0006821155548096, + "sampling/importance_sampling_ratio/min": 0.639241635799408, + "sampling/sampling_logp_difference/max": 0.5016512870788574, + "sampling/sampling_logp_difference/mean": 0.012963157147169113, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 150.546875, + "completions/mean_terminated_length": 150.546875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.22763216495513916, + "epoch": 2.498774509803922, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.260811076751465, + "kl": 0.08246447145938873, + "learning_rate": 8.409212230756563e-08, + "loss": 0.0014, + "num_tokens": 64305345.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6254801750183105, + "sampling/importance_sampling_ratio/mean": 1.0008478164672852, + "sampling/importance_sampling_ratio/min": 0.5552654266357422, + "sampling/sampling_logp_difference/max": 0.5883090496063232, + "sampling/sampling_logp_difference/mean": 0.013004804030060768, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 208.71875, + "completions/mean_terminated_length": 208.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2567097246646881, + "epoch": 2.5, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8692939451667037, + "kl": 0.09157007932662964, + "learning_rate": 8.369713713832622e-08, + "loss": -0.0186, + "num_tokens": 64338159.0, + "reward": -0.15625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001546144485474, + "sampling/importance_sampling_ratio/min": 0.4440856873989105, + "sampling/sampling_logp_difference/max": 0.9236316680908203, + "sampling/sampling_logp_difference/mean": 0.014840789139270782, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 154.296875, + "completions/mean_terminated_length": 154.296875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.28439900279045105, + "epoch": 2.501225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6465117330095138, + "kl": 0.12006236612796783, + "learning_rate": 8.330299703752497e-08, + "loss": 0.0087, + "num_tokens": 64368914.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5714937448501587, + "sampling/importance_sampling_ratio/mean": 0.999998152256012, + "sampling/importance_sampling_ratio/min": 0.5982145667076111, + "sampling/sampling_logp_difference/max": 0.5138057470321655, + "sampling/sampling_logp_difference/mean": 0.01627548784017563, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 202.078125, + "completions/mean_terminated_length": 202.078125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2615170478820801, + "epoch": 2.502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.383059434915081, + "kl": 0.07800057530403137, + "learning_rate": 8.290970280524124e-08, + "loss": 0.0182, + "num_tokens": 64396679.0, + "reward": -0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.7811089754104614, + "sampling/importance_sampling_ratio/mean": 0.9999487996101379, + "sampling/importance_sampling_ratio/min": 0.37332433462142944, + "sampling/sampling_logp_difference/max": 0.9853076934814453, + "sampling/sampling_logp_difference/mean": 0.014414073899388313, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 145.546875, + "completions/mean_terminated_length": 145.546875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.18265660107135773, + "epoch": 2.5036764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5947788116146797, + "kl": 0.08648387342691422, + "learning_rate": 8.251725523983722e-08, + "loss": -0.0145, + "num_tokens": 64421530.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.7496325969696045, + "sampling/importance_sampling_ratio/mean": 1.0000611543655396, + "sampling/importance_sampling_ratio/min": 0.25419026613235474, + "sampling/sampling_logp_difference/max": 1.369672179222107, + "sampling/sampling_logp_difference/mean": 0.012066777795553207, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 196.875, + "completions/mean_terminated_length": 196.875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2852139472961426, + "epoch": 2.5049019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1845940609833618, + "kl": 0.1319739818572998, + "learning_rate": 8.212565513795683e-08, + "loss": 0.0052, + "num_tokens": 64454354.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5439215898513794, + "sampling/importance_sampling_ratio/mean": 1.0001826286315918, + "sampling/importance_sampling_ratio/min": 0.6155157089233398, + "sampling/sampling_logp_difference/max": 0.48529481887817383, + "sampling/sampling_logp_difference/mean": 0.015069638378918171, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 200.6875, + "completions/mean_terminated_length": 200.6875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.21678279340267181, + "epoch": 2.506127450980392, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4893007785410823, + "kl": 0.05048329383134842, + "learning_rate": 8.173490329452343e-08, + "loss": 0.042, + "num_tokens": 64485134.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4760957956314087, + "sampling/importance_sampling_ratio/mean": 0.9999352693557739, + "sampling/importance_sampling_ratio/min": 0.610935628414154, + "sampling/sampling_logp_difference/max": 0.4927637577056885, + "sampling/sampling_logp_difference/mean": 0.012355451472103596, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 161.359375, + "completions/mean_terminated_length": 161.359375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.27877625823020935, + "epoch": 2.5073529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059946639291397104, + "kl": 0.08982808142900467, + "learning_rate": 8.13450005027384e-08, + "loss": 0.0009, + "num_tokens": 64512773.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.630943775177002, + "sampling/importance_sampling_ratio/mean": 0.9997053146362305, + "sampling/importance_sampling_ratio/min": 0.3100298345088959, + "sampling/sampling_logp_difference/max": 1.1710867881774902, + "sampling/sampling_logp_difference/mean": 0.01694381982088089, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 204.859375, + "completions/mean_terminated_length": 204.859375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2658664584159851, + "epoch": 2.508578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.201674207561085, + "kl": 0.07240475714206696, + "learning_rate": 8.09559475540797e-08, + "loss": 0.0524, + "num_tokens": 64544076.0, + "reward": 0.25, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995499849319458, + "sampling/importance_sampling_ratio/min": 0.49081358313560486, + "sampling/sampling_logp_difference/max": 0.7116909027099609, + "sampling/sampling_logp_difference/mean": 0.015354299917817116, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 167.328125, + "completions/mean_terminated_length": 167.328125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.17701229453086853, + "epoch": 2.5098039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05025101002355718, + "kl": 0.04492766410112381, + "learning_rate": 8.056774523830029e-08, + "loss": 0.0005, + "num_tokens": 64568097.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5922887325286865, + "sampling/importance_sampling_ratio/mean": 1.0001029968261719, + "sampling/importance_sampling_ratio/min": 0.3485910892486572, + "sampling/sampling_logp_difference/max": 1.0538556575775146, + "sampling/sampling_logp_difference/mean": 0.011708034202456474, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 186.265625, + "completions/mean_terminated_length": 186.265625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.24314233660697937, + "epoch": 2.5110294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5858602248940317, + "kl": 0.08327071368694305, + "learning_rate": 8.018039434342627e-08, + "loss": -0.1419, + "num_tokens": 64597474.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5546276569366455, + "sampling/importance_sampling_ratio/mean": 1.0001580715179443, + "sampling/importance_sampling_ratio/min": 0.5519617795944214, + "sampling/sampling_logp_difference/max": 0.5942764282226562, + "sampling/sampling_logp_difference/mean": 0.01384375523775816, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 170.984375, + "completions/mean_terminated_length": 170.984375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2665937542915344, + "epoch": 2.5122549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.005298918913521, + "kl": 0.09226707369089127, + "learning_rate": 7.979389565575522e-08, + "loss": 0.0282, + "num_tokens": 64630401.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002641677856445, + "sampling/importance_sampling_ratio/min": 0.5641261339187622, + "sampling/sampling_logp_difference/max": 0.8223857879638672, + "sampling/sampling_logp_difference/mean": 0.016130205243825912, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 190.421875, + "completions/mean_terminated_length": 190.421875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3023907542228699, + "epoch": 2.513480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3700430215733999, + "kl": 0.14257675409317017, + "learning_rate": 7.940824995985528e-08, + "loss": -0.021, + "num_tokens": 64658876.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6105904579162598, + "sampling/importance_sampling_ratio/mean": 0.9993658065795898, + "sampling/importance_sampling_ratio/min": 0.5193053483963013, + "sampling/sampling_logp_difference/max": 0.6552631855010986, + "sampling/sampling_logp_difference/mean": 0.015207895077764988, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 182.3125, + "completions/mean_terminated_length": 182.3125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.22624555230140686, + "epoch": 2.514705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04064950583779161, + "kl": 0.06253501772880554, + "learning_rate": 7.902345803856264e-08, + "loss": 0.0006, + "num_tokens": 64690304.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.844347596168518, + "sampling/importance_sampling_ratio/mean": 1.0002517700195312, + "sampling/importance_sampling_ratio/min": 0.5676478147506714, + "sampling/sampling_logp_difference/max": 0.6121256351470947, + "sampling/sampling_logp_difference/mean": 0.013546247966587543, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 194.859375, + "completions/mean_terminated_length": 194.859375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.19373953342437744, + "epoch": 2.5159313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1898838684284017, + "kl": 0.0509500578045845, + "learning_rate": 7.863952067298041e-08, + "loss": -0.0043, + "num_tokens": 64721047.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004467964172363, + "sampling/importance_sampling_ratio/min": 0.5552655458450317, + "sampling/sampling_logp_difference/max": 0.9446532726287842, + "sampling/sampling_logp_difference/mean": 0.012281032279133797, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 210.875, + "completions/mean_terminated_length": 210.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3182130455970764, + "epoch": 2.517156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6178720617889286, + "kl": 0.16294899582862854, + "learning_rate": 7.825643864247733e-08, + "loss": 0.0121, + "num_tokens": 64757567.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6558222770690918, + "sampling/importance_sampling_ratio/mean": 1.0004451274871826, + "sampling/importance_sampling_ratio/min": 0.6287725567817688, + "sampling/sampling_logp_difference/max": 0.5042977333068848, + "sampling/sampling_logp_difference/mean": 0.01619984209537506, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 168.71875, + "completions/mean_terminated_length": 168.71875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.2119746059179306, + "epoch": 2.5183823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10989378193671345, + "kl": 0.09138666093349457, + "learning_rate": 7.787421272468547e-08, + "loss": 0.0009, + "num_tokens": 64788173.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003702640533447, + "sampling/importance_sampling_ratio/min": 0.6110650897026062, + "sampling/sampling_logp_difference/max": 1.09181809425354, + "sampling/sampling_logp_difference/mean": 0.013934195972979069, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 181.890625, + "completions/mean_terminated_length": 181.890625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3089261054992676, + "epoch": 2.519607843137255, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.452386378197396, + "kl": 0.09943846613168716, + "learning_rate": 7.749284369549952e-08, + "loss": 0.0232, + "num_tokens": 64815158.0, + "reward": 0.3125, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.755895972251892, + "sampling/importance_sampling_ratio/mean": 1.000169277191162, + "sampling/importance_sampling_ratio/min": 0.34555506706237793, + "sampling/sampling_logp_difference/max": 1.062603235244751, + "sampling/sampling_logp_difference/mean": 0.015521117486059666, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 159.265625, + "completions/mean_terminated_length": 159.265625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.29940080642700195, + "epoch": 2.5208333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.508780815193096, + "kl": 0.10483361780643463, + "learning_rate": 7.711233232907399e-08, + "loss": 0.0639, + "num_tokens": 64843287.0, + "reward": 0.625, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6515302658081055, + "sampling/importance_sampling_ratio/mean": 0.9991793632507324, + "sampling/importance_sampling_ratio/min": 0.6066753268241882, + "sampling/sampling_logp_difference/max": 0.5017023086547852, + "sampling/sampling_logp_difference/mean": 0.017587456852197647, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 168.3125, + "completions/mean_terminated_length": 168.3125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2717065215110779, + "epoch": 2.5220588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6533966641128728, + "kl": 0.08779462426900864, + "learning_rate": 7.673267939782324e-08, + "loss": -0.0128, + "num_tokens": 64875995.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.708302617073059, + "sampling/importance_sampling_ratio/mean": 0.9995182156562805, + "sampling/importance_sampling_ratio/min": 0.27822449803352356, + "sampling/sampling_logp_difference/max": 1.2793269157409668, + "sampling/sampling_logp_difference/mean": 0.015954112634062767, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 207.78125, + "completions/mean_terminated_length": 207.78125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.20165926218032837, + "epoch": 2.5232843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03778193338893196, + "kl": 0.06783895939588547, + "learning_rate": 7.63538856724184e-08, + "loss": 0.0007, + "num_tokens": 64910653.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5551292896270752, + "sampling/importance_sampling_ratio/mean": 1.0003447532653809, + "sampling/importance_sampling_ratio/min": 0.5010187029838562, + "sampling/sampling_logp_difference/max": 0.6911118030548096, + "sampling/sampling_logp_difference/mean": 0.012874551117420197, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 149.984375, + "completions/mean_terminated_length": 149.984375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.20012640953063965, + "epoch": 2.5245098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1445528136134168, + "kl": 0.07653352618217468, + "learning_rate": 7.597595192178702e-08, + "loss": 0.0008, + "num_tokens": 64936588.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004346370697021, + "sampling/importance_sampling_ratio/min": 0.25459831953048706, + "sampling/sampling_logp_difference/max": 1.5598270893096924, + "sampling/sampling_logp_difference/mean": 0.015426107682287693, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 181.28125, + "completions/mean_terminated_length": 181.28125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.22334742546081543, + "epoch": 2.525735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6107262833998872, + "kl": 0.07174746692180634, + "learning_rate": 7.559887891311046e-08, + "loss": 0.01, + "num_tokens": 64963470.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6041007041931152, + "sampling/importance_sampling_ratio/mean": 1.0001410245895386, + "sampling/importance_sampling_ratio/min": 0.6211667060852051, + "sampling/sampling_logp_difference/max": 0.47615575790405273, + "sampling/sampling_logp_difference/mean": 0.013342998921871185, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 168.4375, + "completions/mean_terminated_length": 168.4375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.29684188961982727, + "epoch": 2.5269607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8925817098884088, + "kl": 0.09327543526887894, + "learning_rate": 7.522266741182303e-08, + "loss": 0.0316, + "num_tokens": 64998858.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5071570873260498, + "sampling/importance_sampling_ratio/mean": 0.9994641542434692, + "sampling/importance_sampling_ratio/min": 0.6080344319343567, + "sampling/sampling_logp_difference/max": 0.49752378463745117, + "sampling/sampling_logp_difference/mean": 0.015332784503698349, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 202.796875, + "completions/mean_terminated_length": 202.796875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.28393346071243286, + "epoch": 2.528186274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6044857309462037, + "kl": 0.09946949034929276, + "learning_rate": 7.484731818161049e-08, + "loss": -0.0077, + "num_tokens": 65026365.0, + "reward": 0.21875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6209652423858643, + "sampling/importance_sampling_ratio/mean": 0.9997640252113342, + "sampling/importance_sampling_ratio/min": 0.4355246424674988, + "sampling/sampling_logp_difference/max": 0.8312039375305176, + "sampling/sampling_logp_difference/mean": 0.014704588800668716, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 132.09375, + "completions/mean_terminated_length": 132.09375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2502855062484741, + "epoch": 2.5294117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12678763176978367, + "kl": 0.1325320154428482, + "learning_rate": 7.447283198440763e-08, + "loss": 0.0013, + "num_tokens": 65049427.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.51406991481781, + "sampling/importance_sampling_ratio/mean": 1.0000284910202026, + "sampling/importance_sampling_ratio/min": 0.6081269383430481, + "sampling/sampling_logp_difference/max": 0.4973716735839844, + "sampling/sampling_logp_difference/mean": 0.015502297319471836, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 174.203125, + "completions/mean_terminated_length": 174.203125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.1999911367893219, + "epoch": 2.530637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4215878182256434, + "kl": 0.08172202110290527, + "learning_rate": 7.409920958039794e-08, + "loss": -0.0051, + "num_tokens": 65085184.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.700395941734314, + "sampling/importance_sampling_ratio/mean": 1.0005156993865967, + "sampling/importance_sampling_ratio/min": 0.5967065691947937, + "sampling/sampling_logp_difference/max": 0.5308611392974854, + "sampling/sampling_logp_difference/mean": 0.011997406370937824, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 171.1875, + "completions/mean_terminated_length": 171.1875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2532813549041748, + "epoch": 2.531862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3322553538983197, + "kl": 0.13121700286865234, + "learning_rate": 7.372645172801112e-08, + "loss": -0.005, + "num_tokens": 65113180.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.7093794345855713, + "sampling/importance_sampling_ratio/mean": 1.0005707740783691, + "sampling/importance_sampling_ratio/min": 0.5436164140701294, + "sampling/sampling_logp_difference/max": 0.6095113754272461, + "sampling/sampling_logp_difference/mean": 0.014983594417572021, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 200.578125, + "completions/mean_terminated_length": 200.578125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.272316038608551, + "epoch": 2.5330882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.677935571660197, + "kl": 0.08236125856637955, + "learning_rate": 7.335455918392219e-08, + "loss": 0.0161, + "num_tokens": 65146721.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.8162810802459717, + "sampling/importance_sampling_ratio/mean": 0.9998995065689087, + "sampling/importance_sampling_ratio/min": 0.6058658361434937, + "sampling/sampling_logp_difference/max": 0.5967910289764404, + "sampling/sampling_logp_difference/mean": 0.01491851918399334, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 194.0, + "completions/mean_terminated_length": 194.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2139989733695984, + "epoch": 2.534313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040576103423437294, + "kl": 0.060933761298656464, + "learning_rate": 7.29835327030493e-08, + "loss": 0.0006, + "num_tokens": 65173713.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7566498517990112, + "sampling/importance_sampling_ratio/mean": 1.0005521774291992, + "sampling/importance_sampling_ratio/min": 0.6256545186042786, + "sampling/sampling_logp_difference/max": 0.5634084939956665, + "sampling/sampling_logp_difference/mean": 0.013278011232614517, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 195.578125, + "completions/mean_terminated_length": 195.578125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.27939218282699585, + "epoch": 2.5355392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8515158369657, + "kl": 0.0869421735405922, + "learning_rate": 7.261337303855258e-08, + "loss": 0.0053, + "num_tokens": 65205510.0, + "reward": -0.125, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5609079599380493, + "sampling/importance_sampling_ratio/mean": 0.9996978044509888, + "sampling/importance_sampling_ratio/min": 0.46683257818222046, + "sampling/sampling_logp_difference/max": 0.761784553527832, + "sampling/sampling_logp_difference/mean": 0.01585400104522705, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 172.8125, + "completions/mean_terminated_length": 172.8125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.256533682346344, + "epoch": 2.536764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0299182589679767, + "kl": 0.1013125628232956, + "learning_rate": 7.224408094183299e-08, + "loss": -0.0153, + "num_tokens": 65230906.0, + "reward": 0.34375, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.683823585510254, + "sampling/importance_sampling_ratio/mean": 0.9999561905860901, + "sampling/importance_sampling_ratio/min": 0.3722969591617584, + "sampling/sampling_logp_difference/max": 0.9880634546279907, + "sampling/sampling_logp_difference/mean": 0.015095336362719536, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 179.953125, + "completions/mean_terminated_length": 179.953125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.28137242794036865, + "epoch": 2.5379901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9014808199425561, + "kl": 0.11942119896411896, + "learning_rate": 7.187565716252991e-08, + "loss": 0.0309, + "num_tokens": 65257591.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5801266431808472, + "sampling/importance_sampling_ratio/mean": 1.0004737377166748, + "sampling/importance_sampling_ratio/min": 0.41838201880455017, + "sampling/sampling_logp_difference/max": 0.8713603019714355, + "sampling/sampling_logp_difference/mean": 0.014653488993644714, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 172.0625, + "completions/mean_terminated_length": 172.0625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.20256870985031128, + "epoch": 2.5392156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0938691809454313, + "kl": 0.07591012120246887, + "learning_rate": 7.150810244852035e-08, + "loss": 0.0008, + "num_tokens": 65284891.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.4757631719112396, + "sampling/sampling_logp_difference/max": 0.9235873222351074, + "sampling/sampling_logp_difference/mean": 0.013684678822755814, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 166.90625, + "completions/mean_terminated_length": 166.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.26050078868865967, + "epoch": 2.5404411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.568693745447588, + "kl": 0.09759359061717987, + "learning_rate": 7.114141754591691e-08, + "loss": -0.0497, + "num_tokens": 65314533.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8480740785598755, + "sampling/importance_sampling_ratio/mean": 0.9992367029190063, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.6141440868377686, + "sampling/sampling_logp_difference/mean": 0.015216910280287266, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 142.984375, + "completions/mean_terminated_length": 142.984375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.18746830523014069, + "epoch": 2.5416666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11363255287841326, + "kl": 0.06820593774318695, + "learning_rate": 7.077560319906694e-08, + "loss": 0.0007, + "num_tokens": 65340836.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5502012968063354, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.512789785861969, + "sampling/sampling_logp_difference/max": 0.6678893566131592, + "sampling/sampling_logp_difference/mean": 0.01332128793001175, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 221.828125, + "completions/mean_terminated_length": 221.828125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.20960500836372375, + "epoch": 2.542892156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2641235618361277, + "kl": 0.06737308204174042, + "learning_rate": 7.041066015055036e-08, + "loss": -0.0235, + "num_tokens": 65374761.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5975921154022217, + "sampling/importance_sampling_ratio/mean": 0.9998186826705933, + "sampling/importance_sampling_ratio/min": 0.48500892519950867, + "sampling/sampling_logp_difference/max": 0.7235879898071289, + "sampling/sampling_logp_difference/mean": 0.012469634413719177, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 171.015625, + "completions/mean_terminated_length": 171.015625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.20870862901210785, + "epoch": 2.5441176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9795602567650594, + "kl": 0.09434500336647034, + "learning_rate": 7.004658914117822e-08, + "loss": 0.0603, + "num_tokens": 65402202.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.9099678993225098, + "sampling/importance_sampling_ratio/mean": 0.9997029900550842, + "sampling/importance_sampling_ratio/min": 0.5328991413116455, + "sampling/sampling_logp_difference/max": 0.6470863819122314, + "sampling/sampling_logp_difference/mean": 0.013507528230547905, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 149.59375, + "completions/mean_terminated_length": 149.59375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.22802692651748657, + "epoch": 2.545343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4513327872754587, + "kl": 0.10282780975103378, + "learning_rate": 6.968339090999186e-08, + "loss": -0.0054, + "num_tokens": 65430640.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.7198556661605835, + "sampling/importance_sampling_ratio/mean": 0.9994577169418335, + "sampling/importance_sampling_ratio/min": 0.618988037109375, + "sampling/sampling_logp_difference/max": 0.5422403812408447, + "sampling/sampling_logp_difference/mean": 0.013412565924227238, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 183.5625, + "completions/mean_terminated_length": 183.5625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.32695138454437256, + "epoch": 2.5465686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.443674818174446, + "kl": 0.13204100728034973, + "learning_rate": 6.932106619426064e-08, + "loss": -0.0153, + "num_tokens": 65462372.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.424768090248108, + "sampling/importance_sampling_ratio/mean": 1.000126600265503, + "sampling/importance_sampling_ratio/min": 0.44382381439208984, + "sampling/sampling_logp_difference/max": 0.8123276233673096, + "sampling/sampling_logp_difference/mean": 0.016524365171790123, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 166.796875, + "completions/mean_terminated_length": 166.796875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.21823406219482422, + "epoch": 2.547794117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5937634157364964, + "kl": 0.08576545864343643, + "learning_rate": 6.895961572948067e-08, + "loss": -0.0062, + "num_tokens": 65490215.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.7165240049362183, + "sampling/importance_sampling_ratio/mean": 0.999908447265625, + "sampling/importance_sampling_ratio/min": 0.5912283062934875, + "sampling/sampling_logp_difference/max": 0.5403013229370117, + "sampling/sampling_logp_difference/mean": 0.014561184681952, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 188.203125, + "completions/mean_terminated_length": 188.203125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2530749440193176, + "epoch": 2.549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4109219341926849, + "kl": 0.07359974086284637, + "learning_rate": 6.859904024937347e-08, + "loss": -0.0148, + "num_tokens": 65520100.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.9106332063674927, + "sampling/importance_sampling_ratio/mean": 1.0003867149353027, + "sampling/importance_sampling_ratio/min": 0.45605409145355225, + "sampling/sampling_logp_difference/max": 0.7851438522338867, + "sampling/sampling_logp_difference/mean": 0.013720223680138588, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 236.515625, + "completions/mean_terminated_length": 236.515625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.33111363649368286, + "epoch": 2.5502450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3630038149590784, + "kl": 0.11825443804264069, + "learning_rate": 6.823934048588459e-08, + "loss": -0.0437, + "num_tokens": 65551669.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5642359256744385, + "sampling/importance_sampling_ratio/mean": 0.999976634979248, + "sampling/importance_sampling_ratio/min": 0.5727118253707886, + "sampling/sampling_logp_difference/max": 0.5573725700378418, + "sampling/sampling_logp_difference/mean": 0.016054105013608932, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 183.203125, + "completions/mean_terminated_length": 183.203125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3467482924461365, + "epoch": 2.5514705882352944, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.022072626667965, + "kl": 0.1452556550502777, + "learning_rate": 6.78805171691817e-08, + "loss": 0.0233, + "num_tokens": 65584066.0, + "reward": 0.5625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996230602264404, + "sampling/importance_sampling_ratio/min": 0.5155945420265198, + "sampling/sampling_logp_difference/max": 0.7049882411956787, + "sampling/sampling_logp_difference/mean": 0.017521627247333527, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 210.546875, + "completions/mean_terminated_length": 210.546875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3249031603336334, + "epoch": 2.5526960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8277381996354904, + "kl": 0.14773723483085632, + "learning_rate": 6.752257102765324e-08, + "loss": 0.0124, + "num_tokens": 65623157.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.9977896213531494, + "sampling/importance_sampling_ratio/mean": 1.0001258850097656, + "sampling/importance_sampling_ratio/min": 0.5109723210334778, + "sampling/sampling_logp_difference/max": 0.6920413970947266, + "sampling/sampling_logp_difference/mean": 0.017270749434828758, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 266.40625, + "completions/mean_terminated_length": 266.40625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2513664662837982, + "epoch": 2.553921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.629798547570488, + "kl": 0.08482865989208221, + "learning_rate": 6.716550278790739e-08, + "loss": 0.0332, + "num_tokens": 65661583.0, + "reward": 0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.9655275344848633, + "sampling/importance_sampling_ratio/mean": 0.9994052052497864, + "sampling/importance_sampling_ratio/min": 0.4171328544616699, + "sampling/sampling_logp_difference/max": 0.8743505477905273, + "sampling/sampling_logp_difference/mean": 0.01324361003935337, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 168.625, + "completions/mean_terminated_length": 168.625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2828473448753357, + "epoch": 2.5551470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5135601968951389, + "kl": 0.12092584371566772, + "learning_rate": 6.680931317476996e-08, + "loss": 0.0367, + "num_tokens": 65686951.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 1.0004072189331055, + "sampling/importance_sampling_ratio/min": 0.5685153007507324, + "sampling/sampling_logp_difference/max": 0.5647270679473877, + "sampling/sampling_logp_difference/mean": 0.01583164557814598, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 163.546875, + "completions/mean_terminated_length": 163.546875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3796335458755493, + "epoch": 2.556372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1911367549192553, + "kl": 0.15162840485572815, + "learning_rate": 6.645400291128356e-08, + "loss": 0.0103, + "num_tokens": 65722042.0, + "reward": 0.4375, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5306832790374756, + "sampling/importance_sampling_ratio/mean": 1.0003660917282104, + "sampling/importance_sampling_ratio/min": 0.6159006953239441, + "sampling/sampling_logp_difference/max": 0.4846695065498352, + "sampling/sampling_logp_difference/mean": 0.019542595371603966, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 234.78125, + "completions/mean_terminated_length": 234.78125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3299919366836548, + "epoch": 2.5575980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5794549794020731, + "kl": 0.086224764585495, + "learning_rate": 6.609957271870503e-08, + "loss": -0.0126, + "num_tokens": 65757164.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6277685165405273, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 0.3673675060272217, + "sampling/sampling_logp_difference/max": 1.0013926029205322, + "sampling/sampling_logp_difference/mean": 0.015792466700077057, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 194.546875, + "completions/mean_terminated_length": 194.546875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.19744019210338593, + "epoch": 2.5588235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028118563136091786, + "kl": 0.053611598908901215, + "learning_rate": 6.574602331650559e-08, + "loss": 0.0005, + "num_tokens": 65785839.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7589657306671143, + "sampling/importance_sampling_ratio/mean": 1.0012531280517578, + "sampling/importance_sampling_ratio/min": 0.5676552653312683, + "sampling/sampling_logp_difference/max": 0.5662409067153931, + "sampling/sampling_logp_difference/mean": 0.01284267008304596, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 163.84375, + "completions/mean_terminated_length": 163.84375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.25681692361831665, + "epoch": 2.560049019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.283364651688548, + "kl": 0.10075172781944275, + "learning_rate": 6.539335542236802e-08, + "loss": 0.0091, + "num_tokens": 65815237.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996932148933411, + "sampling/importance_sampling_ratio/min": 0.5661249160766602, + "sampling/sampling_logp_difference/max": 0.8119585514068604, + "sampling/sampling_logp_difference/mean": 0.015214763581752777, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 191.46875, + "completions/mean_terminated_length": 191.46875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.16612298786640167, + "epoch": 2.561274509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055992535310503404, + "kl": 0.05680924654006958, + "learning_rate": 6.504156975218567e-08, + "loss": 0.0005, + "num_tokens": 65841731.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5558738708496094, + "sampling/importance_sampling_ratio/mean": 0.9993709921836853, + "sampling/importance_sampling_ratio/min": 0.3743380606174469, + "sampling/sampling_logp_difference/max": 0.9825959205627441, + "sampling/sampling_logp_difference/mean": 0.011070625856518745, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.28278350830078125, + "epoch": 2.5625, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7593979467452114, + "kl": 0.07036584615707397, + "learning_rate": 6.469066702006137e-08, + "loss": -0.1094, + "num_tokens": 65873643.0, + "reward": 0.53125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7612903118133545, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.2949601709842682, + "sampling/sampling_logp_difference/max": 1.2209149599075317, + "sampling/sampling_logp_difference/mean": 0.01508941687643528, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 196.578125, + "completions/mean_terminated_length": 196.578125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3417613208293915, + "epoch": 2.563725490196078, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.828518664209619, + "kl": 0.10849667340517044, + "learning_rate": 6.43406479383053e-08, + "loss": 0.0076, + "num_tokens": 65901568.0, + "reward": 0.125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.965592861175537, + "sampling/importance_sampling_ratio/mean": 1.00014328956604, + "sampling/importance_sampling_ratio/min": 0.6171426773071289, + "sampling/sampling_logp_difference/max": 0.6757938861846924, + "sampling/sampling_logp_difference/mean": 0.01746380887925625, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 141.234375, + "completions/mean_terminated_length": 141.234375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.19732163846492767, + "epoch": 2.564950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7564960077024474, + "kl": 0.08417364954948425, + "learning_rate": 6.399151321743423e-08, + "loss": 0.0272, + "num_tokens": 65921871.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6207635402679443, + "sampling/importance_sampling_ratio/mean": 0.9992195963859558, + "sampling/importance_sampling_ratio/min": 0.591346025466919, + "sampling/sampling_logp_difference/max": 0.5253539085388184, + "sampling/sampling_logp_difference/mean": 0.014043517410755157, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 179.21875, + "completions/mean_terminated_length": 179.21875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.242929607629776, + "epoch": 2.5661764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3703714253707795, + "kl": 0.07308401167392731, + "learning_rate": 6.364326356616917e-08, + "loss": -0.0153, + "num_tokens": 65958541.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8151217699050903, + "sampling/importance_sampling_ratio/mean": 0.9997493028640747, + "sampling/importance_sampling_ratio/min": 0.45788392424583435, + "sampling/sampling_logp_difference/max": 0.781139612197876, + "sampling/sampling_logp_difference/mean": 0.01518308650702238, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 171.65625, + "completions/mean_terminated_length": 171.65625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.20393404364585876, + "epoch": 2.5674019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044885106882471194, + "kl": 0.076746866106987, + "learning_rate": 6.329589969143517e-08, + "loss": 0.0007, + "num_tokens": 65987543.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9099587202072144, + "sampling/importance_sampling_ratio/mean": 1.0000215768814087, + "sampling/importance_sampling_ratio/min": 0.5633079409599304, + "sampling/sampling_logp_difference/max": 0.6470816135406494, + "sampling/sampling_logp_difference/mean": 0.012605559080839157, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 202.546875, + "completions/mean_terminated_length": 202.546875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.27725207805633545, + "epoch": 2.568627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1611957430125468, + "kl": 0.08803446590900421, + "learning_rate": 6.29494222983587e-08, + "loss": 0.038, + "num_tokens": 66025834.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999281167984009, + "sampling/importance_sampling_ratio/min": 0.3211215138435364, + "sampling/sampling_logp_difference/max": 1.135935664176941, + "sampling/sampling_logp_difference/mean": 0.015521700493991375, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 203.46875, + "completions/mean_terminated_length": 203.46875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.28914710879325867, + "epoch": 2.5698529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2299099073108812, + "kl": 0.09882599115371704, + "learning_rate": 6.260383209026704e-08, + "loss": 0.0148, + "num_tokens": 66059304.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9051291942596436, + "sampling/importance_sampling_ratio/mean": 1.0000487565994263, + "sampling/importance_sampling_ratio/min": 0.5375722050666809, + "sampling/sampling_logp_difference/max": 0.6445498466491699, + "sampling/sampling_logp_difference/mean": 0.015876304358243942, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 203.546875, + "completions/mean_terminated_length": 203.546875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.3735591471195221, + "epoch": 2.571078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7435414209196602, + "kl": 0.1381000429391861, + "learning_rate": 6.225912976868636e-08, + "loss": 0.0536, + "num_tokens": 66092219.0, + "reward": 0.8125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005311965942383, + "sampling/importance_sampling_ratio/min": 0.29176315665245056, + "sampling/sampling_logp_difference/max": 1.2318129539489746, + "sampling/sampling_logp_difference/mean": 0.020947258919477463, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 208.609375, + "completions/mean_terminated_length": 208.609375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.2167380303144455, + "epoch": 2.5723039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4442468872495544, + "kl": 0.11169961839914322, + "learning_rate": 6.191531603334044e-08, + "loss": -0.0223, + "num_tokens": 66120562.0, + "reward": 0.4375, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6635867357254028, + "sampling/importance_sampling_ratio/mean": 1.0005972385406494, + "sampling/importance_sampling_ratio/min": 0.6209225654602051, + "sampling/sampling_logp_difference/max": 0.5089759826660156, + "sampling/sampling_logp_difference/mean": 0.012378549203276634, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 239.234375, + "completions/mean_terminated_length": 239.234375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.20222190022468567, + "epoch": 2.5735294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.239160434612888, + "kl": 0.04628078266978264, + "learning_rate": 6.157239158214966e-08, + "loss": -0.011, + "num_tokens": 66158977.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000662803649902, + "sampling/importance_sampling_ratio/min": 0.4699024558067322, + "sampling/sampling_logp_difference/max": 0.8504469394683838, + "sampling/sampling_logp_difference/mean": 0.012522635981440544, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 190.765625, + "completions/mean_terminated_length": 190.765625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.2047121524810791, + "epoch": 2.5747549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08701466942171444, + "kl": 0.08167508244514465, + "learning_rate": 6.123035711122859e-08, + "loss": 0.0009, + "num_tokens": 66189074.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.649046778678894, + "sampling/importance_sampling_ratio/mean": 0.9988332986831665, + "sampling/importance_sampling_ratio/min": 0.4462607800960541, + "sampling/sampling_logp_difference/max": 0.806851863861084, + "sampling/sampling_logp_difference/mean": 0.01280701719224453, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 209.828125, + "completions/mean_terminated_length": 209.828125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2900751829147339, + "epoch": 2.575980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8466726522520487, + "kl": 0.12072841823101044, + "learning_rate": 6.088921331488566e-08, + "loss": 0.048, + "num_tokens": 66219319.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.7802555561065674, + "sampling/importance_sampling_ratio/mean": 0.9999192953109741, + "sampling/importance_sampling_ratio/min": 0.4764772057533264, + "sampling/sampling_logp_difference/max": 0.741335391998291, + "sampling/sampling_logp_difference/mean": 0.015741858631372452, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 174.03125, + "completions/mean_terminated_length": 174.03125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.1932278871536255, + "epoch": 2.577205882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8835831160264318, + "kl": 0.07695497572422028, + "learning_rate": 6.05489608856214e-08, + "loss": 0.0817, + "num_tokens": 66247353.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.848081111907959, + "sampling/importance_sampling_ratio/mean": 0.9999405145645142, + "sampling/importance_sampling_ratio/min": 0.48282214999198914, + "sampling/sampling_logp_difference/max": 0.7281069159507751, + "sampling/sampling_logp_difference/mean": 0.012569449841976166, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 188.6875, + "completions/mean_terminated_length": 188.6875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.23454561829566956, + "epoch": 2.5784313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08303032723481436, + "kl": 0.09593294560909271, + "learning_rate": 6.020960051412638e-08, + "loss": 0.0009, + "num_tokens": 66276421.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000700950622559, + "sampling/importance_sampling_ratio/min": 0.5919349193572998, + "sampling/sampling_logp_difference/max": 0.7266454696655273, + "sampling/sampling_logp_difference/mean": 0.01347368024289608, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 194.78125, + "completions/mean_terminated_length": 194.78125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.2750077247619629, + "epoch": 2.579656862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9504997349472788, + "kl": 0.10330727696418762, + "learning_rate": 5.98711328892808e-08, + "loss": 0.0622, + "num_tokens": 66307863.0, + "reward": 0.65625, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.458220362663269, + "sampling/importance_sampling_ratio/mean": 0.9996346235275269, + "sampling/importance_sampling_ratio/min": 0.416096568107605, + "sampling/sampling_logp_difference/max": 0.8768379092216492, + "sampling/sampling_logp_difference/mean": 0.015241402201354504, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 213.375, + "completions/mean_terminated_length": 213.375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.30214518308639526, + "epoch": 2.5808823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.806112724407539, + "kl": 0.1500733196735382, + "learning_rate": 5.9533558698152355e-08, + "loss": -0.0382, + "num_tokens": 66340031.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5259431600570679, + "sampling/importance_sampling_ratio/mean": 0.9996534585952759, + "sampling/importance_sampling_ratio/min": 0.5610695481300354, + "sampling/sampling_logp_difference/max": 0.5779104232788086, + "sampling/sampling_logp_difference/mean": 0.015558110550045967, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 259.140625, + "completions/mean_terminated_length": 259.140625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.31714487075805664, + "epoch": 2.582107843137255, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9515772424116236, + "kl": 0.1299181431531906, + "learning_rate": 5.919687862599548e-08, + "loss": 0.0353, + "num_tokens": 66375448.0, + "reward": 0.71875, + "reward_std": 0.6751632690429688, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5271198749542236, + "sampling/importance_sampling_ratio/mean": 0.9996883869171143, + "sampling/importance_sampling_ratio/min": 0.5868014693260193, + "sampling/sampling_logp_difference/max": 0.5330686569213867, + "sampling/sampling_logp_difference/mean": 0.015624091029167175, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 173.65625, + "completions/mean_terminated_length": 173.65625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2470850646495819, + "epoch": 2.5833333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045180226644925776, + "kl": 0.08131169527769089, + "learning_rate": 5.886109335624928e-08, + "loss": 0.0008, + "num_tokens": 66406498.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6108070611953735, + "sampling/importance_sampling_ratio/mean": 0.9997807741165161, + "sampling/importance_sampling_ratio/min": 0.6077000498771667, + "sampling/sampling_logp_difference/max": 0.4980738162994385, + "sampling/sampling_logp_difference/mean": 0.01548383105546236, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 135.265625, + "completions/mean_terminated_length": 135.265625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.17381243407726288, + "epoch": 2.5845588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07604536003009166, + "kl": 0.06403857469558716, + "learning_rate": 5.8526203570536504e-08, + "loss": 0.0006, + "num_tokens": 66427587.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.574164867401123, + "sampling/importance_sampling_ratio/mean": 1.0000239610671997, + "sampling/importance_sampling_ratio/min": 0.5934368968009949, + "sampling/sampling_logp_difference/max": 0.5218243598937988, + "sampling/sampling_logp_difference/mean": 0.012833647429943085, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 135.96875, + "completions/mean_terminated_length": 135.96875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.22191357612609863, + "epoch": 2.5857843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059451595488596576, + "kl": 0.0778200775384903, + "learning_rate": 5.819220994866236e-08, + "loss": 0.0008, + "num_tokens": 66451377.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5200186967849731, + "sampling/importance_sampling_ratio/mean": 0.9999660849571228, + "sampling/importance_sampling_ratio/min": 0.6176431775093079, + "sampling/sampling_logp_difference/max": 0.481844425201416, + "sampling/sampling_logp_difference/mean": 0.013643322512507439, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 167.1875, + "completions/mean_terminated_length": 167.1875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.25612854957580566, + "epoch": 2.5870098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.470913918959459, + "kl": 0.08519189059734344, + "learning_rate": 5.7859113168612696e-08, + "loss": 0.0171, + "num_tokens": 66480445.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.838975429534912, + "sampling/importance_sampling_ratio/mean": 0.9992935657501221, + "sampling/importance_sampling_ratio/min": 0.5243265628814697, + "sampling/sampling_logp_difference/max": 0.6456406116485596, + "sampling/sampling_logp_difference/mean": 0.014908654615283012, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 221.515625, + "completions/mean_terminated_length": 221.515625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.24222581088542938, + "epoch": 2.588235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04218740059821595, + "kl": 0.054951492697000504, + "learning_rate": 5.7526913906552786e-08, + "loss": 0.0006, + "num_tokens": 66520142.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999772846698761, + "sampling/importance_sampling_ratio/min": 0.4225720167160034, + "sampling/sampling_logp_difference/max": 0.8613954186439514, + "sampling/sampling_logp_difference/mean": 0.015231235884130001, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 248.265625, + "completions/mean_terminated_length": 248.265625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.2756633162498474, + "epoch": 2.5894607843137254, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8832164839881007, + "kl": 0.08400071412324905, + "learning_rate": 5.7195612836826055e-08, + "loss": 0.0005, + "num_tokens": 66553407.0, + "reward": 0.125, + "reward_std": 0.6311737298965454, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6103041172027588, + "sampling/importance_sampling_ratio/mean": 1.0004600286483765, + "sampling/importance_sampling_ratio/min": 0.6165305376052856, + "sampling/sampling_logp_difference/max": 0.48364734649658203, + "sampling/sampling_logp_difference/mean": 0.013941222801804543, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 184.453125, + "completions/mean_terminated_length": 184.453125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3001672625541687, + "epoch": 2.590686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1600929794595727, + "kl": 0.09254103899002075, + "learning_rate": 5.686521063195287e-08, + "loss": -0.0046, + "num_tokens": 66583676.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.633475422859192, + "sampling/importance_sampling_ratio/mean": 0.9997601509094238, + "sampling/importance_sampling_ratio/min": 0.5941472053527832, + "sampling/sampling_logp_difference/max": 0.5206282138824463, + "sampling/sampling_logp_difference/mean": 0.01590839773416519, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 290.515625, + "completions/mean_terminated_length": 290.515625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.19757306575775146, + "epoch": 2.5919117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8962636414866241, + "kl": 0.07685651630163193, + "learning_rate": 5.6535707962628685e-08, + "loss": 0.0426, + "num_tokens": 66625757.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6202460527420044, + "sampling/importance_sampling_ratio/mean": 1.0001763105392456, + "sampling/importance_sampling_ratio/min": 0.1880382001399994, + "sampling/sampling_logp_difference/max": 1.6711101531982422, + "sampling/sampling_logp_difference/mean": 0.012390850111842155, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 159.40625, + "completions/mean_terminated_length": 159.40625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.269588440656662, + "epoch": 2.593137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0103775455000308, + "kl": 0.10725459456443787, + "learning_rate": 5.620710549772295e-08, + "loss": 0.0092, + "num_tokens": 66658983.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6176186800003052, + "sampling/importance_sampling_ratio/mean": 0.9991856813430786, + "sampling/importance_sampling_ratio/min": 0.1900569498538971, + "sampling/sampling_logp_difference/max": 1.6604315042495728, + "sampling/sampling_logp_difference/mean": 0.016164135187864304, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 154.875, + "completions/mean_terminated_length": 154.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2638397812843323, + "epoch": 2.594362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1614219117302582, + "kl": 0.10906018316745758, + "learning_rate": 5.5879403904278034e-08, + "loss": -0.0004, + "num_tokens": 66683711.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.656031608581543, + "sampling/importance_sampling_ratio/mean": 1.000084638595581, + "sampling/importance_sampling_ratio/min": 0.5363211631774902, + "sampling/sampling_logp_difference/max": 0.6230220794677734, + "sampling/sampling_logp_difference/mean": 0.015072275884449482, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 171.96875, + "completions/mean_terminated_length": 171.96875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.2775786221027374, + "epoch": 2.5955882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6394531574709035, + "kl": 0.13202057778835297, + "learning_rate": 5.555260384750721e-08, + "loss": -0.0068, + "num_tokens": 66710525.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8529046773910522, + "sampling/importance_sampling_ratio/mean": 0.9998239278793335, + "sampling/importance_sampling_ratio/min": 0.5499146580696106, + "sampling/sampling_logp_difference/max": 0.6167545318603516, + "sampling/sampling_logp_difference/mean": 0.01635042577981949, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 204.78125, + "completions/mean_terminated_length": 204.78125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.22385014593601227, + "epoch": 2.596813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06418912554396708, + "kl": 0.07331070303916931, + "learning_rate": 5.5226705990794156e-08, + "loss": 0.0008, + "num_tokens": 66747983.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6319586038589478, + "sampling/importance_sampling_ratio/mean": 1.0003552436828613, + "sampling/importance_sampling_ratio/min": 0.5362968444824219, + "sampling/sampling_logp_difference/max": 0.6230674982070923, + "sampling/sampling_logp_difference/mean": 0.014345422387123108, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 221.546875, + "completions/mean_terminated_length": 221.546875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.2747361958026886, + "epoch": 2.5980392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.686704366115504, + "kl": 0.08461602032184601, + "learning_rate": 5.4901710995690576e-08, + "loss": -0.0303, + "num_tokens": 66780162.0, + "reward": 0.21875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.770574927330017, + "sampling/importance_sampling_ratio/mean": 0.9999843835830688, + "sampling/importance_sampling_ratio/min": 0.623648464679718, + "sampling/sampling_logp_difference/max": 0.5713043212890625, + "sampling/sampling_logp_difference/mean": 0.015064763836562634, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 197.3125, + "completions/mean_terminated_length": 197.3125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.26871490478515625, + "epoch": 2.599264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0842918697112692, + "kl": 0.09573175013065338, + "learning_rate": 5.4577619521915916e-08, + "loss": 0.0009, + "num_tokens": 66811302.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5764344930648804, + "sampling/importance_sampling_ratio/mean": 1.0004682540893555, + "sampling/importance_sampling_ratio/min": 0.5224652290344238, + "sampling/sampling_logp_difference/max": 0.6491968631744385, + "sampling/sampling_logp_difference/mean": 0.01558225229382515, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 200.390625, + "completions/mean_terminated_length": 200.390625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.25763362646102905, + "epoch": 2.6004901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9315214618315621, + "kl": 0.10824404656887054, + "learning_rate": 5.425443222735526e-08, + "loss": -0.02, + "num_tokens": 66839839.0, + "reward": 0.15625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5747989416122437, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.5106753706932068, + "sampling/sampling_logp_difference/max": 0.6720211505889893, + "sampling/sampling_logp_difference/mean": 0.014287374913692474, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 205.09375, + "completions/mean_terminated_length": 205.09375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3279130756855011, + "epoch": 2.6017156862745097, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9649989694015835, + "kl": 0.0774766206741333, + "learning_rate": 5.393214976805832e-08, + "loss": -0.0067, + "num_tokens": 66875301.0, + "reward": 0.15625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5310070514678955, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 0.6203033924102783, + "sampling/sampling_logp_difference/max": 0.47754669189453125, + "sampling/sampling_logp_difference/mean": 0.018412724137306213, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 176.765625, + "completions/mean_terminated_length": 176.765625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.28056013584136963, + "epoch": 2.6029411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0256661287488673, + "kl": 0.08669671416282654, + "learning_rate": 5.361077279823817e-08, + "loss": -0.0208, + "num_tokens": 66903190.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5671733617782593, + "sampling/importance_sampling_ratio/mean": 0.9998500347137451, + "sampling/importance_sampling_ratio/min": 0.5264557003974915, + "sampling/sampling_logp_difference/max": 0.6415880918502808, + "sampling/sampling_logp_difference/mean": 0.015238385647535324, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 266.203125, + "completions/mean_terminated_length": 266.203125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.24432271718978882, + "epoch": 2.6041666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0124809033525424, + "kl": 0.07161411643028259, + "learning_rate": 5.3290301970269514e-08, + "loss": 0.0092, + "num_tokens": 66937155.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007449388504028, + "sampling/importance_sampling_ratio/min": 0.47952455282211304, + "sampling/sampling_logp_difference/max": 0.7349601984024048, + "sampling/sampling_logp_difference/mean": 0.014381598681211472, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 161.59375, + "completions/mean_terminated_length": 161.59375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.24809454381465912, + "epoch": 2.605392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3918587831956422, + "kl": 0.0901394635438919, + "learning_rate": 5.29707379346882e-08, + "loss": 0.0029, + "num_tokens": 66962969.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4687564373016357, + "sampling/importance_sampling_ratio/mean": 0.9996079206466675, + "sampling/importance_sampling_ratio/min": 0.37306836247444153, + "sampling/sampling_logp_difference/max": 0.9859936237335205, + "sampling/sampling_logp_difference/mean": 0.015450348146259785, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 166.859375, + "completions/mean_terminated_length": 166.859375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.23761391639709473, + "epoch": 2.6066176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05433803802921959, + "kl": 0.07408708333969116, + "learning_rate": 5.2652081340188506e-08, + "loss": 0.0007, + "num_tokens": 66992624.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6512004137039185, + "sampling/importance_sampling_ratio/mean": 1.0000839233398438, + "sampling/importance_sampling_ratio/min": 0.3969041109085083, + "sampling/sampling_logp_difference/max": 0.924060583114624, + "sampling/sampling_logp_difference/mean": 0.014247927814722061, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 206.03125, + "completions/mean_terminated_length": 206.03125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.22145050764083862, + "epoch": 2.607843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.168122173907982, + "kl": 0.08317376673221588, + "learning_rate": 5.2334332833623487e-08, + "loss": -0.0355, + "num_tokens": 67026194.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001128911972046, + "sampling/importance_sampling_ratio/min": 0.5911355018615723, + "sampling/sampling_logp_difference/max": 0.7048373222351074, + "sampling/sampling_logp_difference/mean": 0.013085026293992996, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.32461151480674744, + "epoch": 2.6090686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1978703401036495, + "kl": 0.10807499289512634, + "learning_rate": 5.2017493060002196e-08, + "loss": 0.0122, + "num_tokens": 67058466.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5546787977218628, + "sampling/importance_sampling_ratio/mean": 0.9997225999832153, + "sampling/importance_sampling_ratio/min": 0.5561249256134033, + "sampling/sampling_logp_difference/max": 0.5867623090744019, + "sampling/sampling_logp_difference/mean": 0.017366349697113037, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 201.125, + "completions/mean_terminated_length": 201.125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.24303880333900452, + "epoch": 2.610294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0506152082471976, + "kl": 0.07377795875072479, + "learning_rate": 5.1701562662489596e-08, + "loss": 0.0317, + "num_tokens": 67093034.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004481077194214, + "sampling/importance_sampling_ratio/min": 0.14816617965698242, + "sampling/sampling_logp_difference/max": 1.9094208478927612, + "sampling/sampling_logp_difference/mean": 0.015139998868107796, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 238.328125, + "completions/mean_terminated_length": 238.328125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.27354949712753296, + "epoch": 2.611519607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2904571953305226, + "kl": 0.09254096448421478, + "learning_rate": 5.138654228240424e-08, + "loss": 0.0042, + "num_tokens": 67126751.0, + "reward": 0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5467699766159058, + "sampling/importance_sampling_ratio/mean": 0.9996681809425354, + "sampling/importance_sampling_ratio/min": 0.5174968838691711, + "sampling/sampling_logp_difference/max": 0.6587517261505127, + "sampling/sampling_logp_difference/mean": 0.01584434136748314, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 164.65625, + "completions/mean_terminated_length": 164.65625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.24980810284614563, + "epoch": 2.6127450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9901589062946016, + "kl": 0.08331011980772018, + "learning_rate": 5.1072432559217446e-08, + "loss": 0.0142, + "num_tokens": 67156057.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8933290243148804, + "sampling/importance_sampling_ratio/mean": 0.9995639324188232, + "sampling/importance_sampling_ratio/min": 0.5046089291572571, + "sampling/sampling_logp_difference/max": 0.683971643447876, + "sampling/sampling_logp_difference/mean": 0.015042290091514587, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 194.5, + "completions/mean_terminated_length": 194.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.24290236830711365, + "epoch": 2.6139705882352944, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8874488386936594, + "kl": 0.12486511468887329, + "learning_rate": 5.075923413055222e-08, + "loss": 0.03, + "num_tokens": 67184825.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.856454849243164, + "sampling/importance_sampling_ratio/mean": 1.0002044439315796, + "sampling/importance_sampling_ratio/min": 0.6065518856048584, + "sampling/sampling_logp_difference/max": 0.6186686754226685, + "sampling/sampling_logp_difference/mean": 0.014220327138900757, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 174.953125, + "completions/mean_terminated_length": 174.953125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.23684829473495483, + "epoch": 2.6151960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6676110255035501, + "kl": 0.10212721675634384, + "learning_rate": 5.044694763218149e-08, + "loss": -0.0134, + "num_tokens": 67212182.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5247656106948853, + "sampling/importance_sampling_ratio/mean": 1.0001076459884644, + "sampling/importance_sampling_ratio/min": 0.6131953001022339, + "sampling/sampling_logp_difference/max": 0.4890718460083008, + "sampling/sampling_logp_difference/mean": 0.01379355788230896, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 218.296875, + "completions/mean_terminated_length": 218.296875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.20745372772216797, + "epoch": 2.616421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7549346864798054, + "kl": 0.060086145997047424, + "learning_rate": 5.013557369802701e-08, + "loss": 0.0199, + "num_tokens": 67243673.0, + "reward": 0.6875, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005900859832764, + "sampling/importance_sampling_ratio/min": 0.26832762360572815, + "sampling/sampling_logp_difference/max": 1.3155465126037598, + "sampling/sampling_logp_difference/mean": 0.012244252488017082, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 176.140625, + "completions/mean_terminated_length": 176.140625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.22691315412521362, + "epoch": 2.6176470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05290554143748817, + "kl": 0.06415960937738419, + "learning_rate": 4.982511296015807e-08, + "loss": 0.0006, + "num_tokens": 67270418.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9223034381866455, + "sampling/importance_sampling_ratio/mean": 0.999817430973053, + "sampling/importance_sampling_ratio/min": 0.6110133528709412, + "sampling/sampling_logp_difference/max": 0.6535241603851318, + "sampling/sampling_logp_difference/mean": 0.014248640276491642, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 187.703125, + "completions/mean_terminated_length": 187.703125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.276864230632782, + "epoch": 2.618872549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3544910011995401, + "kl": 0.08870755136013031, + "learning_rate": 4.951556604879048e-08, + "loss": 0.0364, + "num_tokens": 67298975.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005528926849365, + "sampling/importance_sampling_ratio/min": 0.4715724587440491, + "sampling/sampling_logp_difference/max": 0.7657084465026855, + "sampling/sampling_logp_difference/mean": 0.016868922859430313, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 182.9375, + "completions/mean_terminated_length": 182.9375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.2801469564437866, + "epoch": 2.6200980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0659936887142005, + "kl": 0.1408994495868683, + "learning_rate": 4.9206933592284725e-08, + "loss": -0.0189, + "num_tokens": 67335675.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9737051725387573, + "sampling/importance_sampling_ratio/mean": 1.0005115270614624, + "sampling/importance_sampling_ratio/min": 0.2446073442697525, + "sampling/sampling_logp_difference/max": 1.4081010818481445, + "sampling/sampling_logp_difference/mean": 0.01681932434439659, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 212.640625, + "completions/mean_terminated_length": 212.640625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.22469255328178406, + "epoch": 2.6213235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.378163899528437, + "kl": 0.04773491621017456, + "learning_rate": 4.889921621714516e-08, + "loss": 0.0746, + "num_tokens": 67374036.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5277727842330933, + "sampling/importance_sampling_ratio/mean": 0.999775230884552, + "sampling/importance_sampling_ratio/min": 0.30699262022972107, + "sampling/sampling_logp_difference/max": 1.180931568145752, + "sampling/sampling_logp_difference/mean": 0.012799415737390518, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 197.40625, + "completions/mean_terminated_length": 197.40625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.28447532653808594, + "epoch": 2.622549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1631593950910148, + "kl": 0.106813944876194, + "learning_rate": 4.859241454801866e-08, + "loss": -0.01, + "num_tokens": 67407550.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.7253344058990479, + "sampling/importance_sampling_ratio/mean": 0.9998149871826172, + "sampling/importance_sampling_ratio/min": 0.4956277906894684, + "sampling/sampling_logp_difference/max": 0.701930046081543, + "sampling/sampling_logp_difference/mean": 0.015472184866666794, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 206.359375, + "completions/mean_terminated_length": 206.359375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.1956772357225418, + "epoch": 2.623774509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.050202017905082, + "kl": 0.07657797634601593, + "learning_rate": 4.828652920769311e-08, + "loss": -0.0344, + "num_tokens": 67439045.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004371404647827, + "sampling/importance_sampling_ratio/min": 0.41190850734710693, + "sampling/sampling_logp_difference/max": 0.8869540691375732, + "sampling/sampling_logp_difference/mean": 0.012367982417345047, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 160.453125, + "completions/mean_terminated_length": 160.453125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.1892467588186264, + "epoch": 2.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06270786251956269, + "kl": 0.06768421828746796, + "learning_rate": 4.7981560817096366e-08, + "loss": 0.0007, + "num_tokens": 67466226.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6377545595169067, + "sampling/importance_sampling_ratio/mean": 0.9995848536491394, + "sampling/importance_sampling_ratio/min": 0.6116840243339539, + "sampling/sampling_logp_difference/max": 0.49332618713378906, + "sampling/sampling_logp_difference/mean": 0.01251022145152092, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 145.671875, + "completions/mean_terminated_length": 145.671875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.21630370616912842, + "epoch": 2.626225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8027880141909465, + "kl": 0.09206929802894592, + "learning_rate": 4.767750999529485e-08, + "loss": -0.0267, + "num_tokens": 67489805.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.8722227811813354, + "sampling/importance_sampling_ratio/mean": 0.999727189540863, + "sampling/importance_sampling_ratio/min": 0.6369261741638184, + "sampling/sampling_logp_difference/max": 0.6271264553070068, + "sampling/sampling_logp_difference/mean": 0.01369383092969656, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 198.625, + "completions/mean_terminated_length": 198.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.18437042832374573, + "epoch": 2.627450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05353737510970359, + "kl": 0.06574147939682007, + "learning_rate": 4.7374377359492624e-08, + "loss": 0.0006, + "num_tokens": 67520741.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999595582485199, + "sampling/importance_sampling_ratio/min": 0.40562233328819275, + "sampling/sampling_logp_difference/max": 0.9023327827453613, + "sampling/sampling_logp_difference/mean": 0.012694522738456726, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 197.84375, + "completions/mean_terminated_length": 197.84375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.2577444911003113, + "epoch": 2.6286764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4293126999214278, + "kl": 0.08104147017002106, + "learning_rate": 4.707216352502974e-08, + "loss": 0.0182, + "num_tokens": 67550139.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6033425331115723, + "sampling/importance_sampling_ratio/mean": 0.9998073577880859, + "sampling/importance_sampling_ratio/min": 0.5484043955802917, + "sampling/sampling_logp_difference/max": 0.6007423400878906, + "sampling/sampling_logp_difference/mean": 0.015712305903434753, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 177.921875, + "completions/mean_terminated_length": 177.921875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.360873818397522, + "epoch": 2.6299019607843137, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671935418846424, + "kl": 0.14488419890403748, + "learning_rate": 4.6770869105380914e-08, + "loss": 0.0015, + "num_tokens": 67583798.0, + "reward": 0.40625, + "reward_std": 0.8327301740646362, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.8791263103485107, + "sampling/importance_sampling_ratio/mean": 0.9996519684791565, + "sampling/importance_sampling_ratio/min": 0.5849604606628418, + "sampling/sampling_logp_difference/max": 0.6308069229125977, + "sampling/sampling_logp_difference/mean": 0.018338389694690704, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 179.234375, + "completions/mean_terminated_length": 179.234375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.17377278208732605, + "epoch": 2.631127450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2666293172358711, + "kl": 0.060918159782886505, + "learning_rate": 4.647049471215497e-08, + "loss": 0.0163, + "num_tokens": 67613397.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4857426881790161, + "sampling/importance_sampling_ratio/mean": 0.9993095993995667, + "sampling/importance_sampling_ratio/min": 0.6075646281242371, + "sampling/sampling_logp_difference/max": 0.49829673767089844, + "sampling/sampling_logp_difference/mean": 0.011565905064344406, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 165.109375, + "completions/mean_terminated_length": 165.109375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.2292921543121338, + "epoch": 2.6323529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04980462181107673, + "kl": 0.09878659248352051, + "learning_rate": 4.6171040955092835e-08, + "loss": 0.0009, + "num_tokens": 67640044.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6528207063674927, + "sampling/importance_sampling_ratio/mean": 0.9994211196899414, + "sampling/importance_sampling_ratio/min": 0.6100565791130066, + "sampling/sampling_logp_difference/max": 0.5024833679199219, + "sampling/sampling_logp_difference/mean": 0.013480335474014282, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 186.140625, + "completions/mean_terminated_length": 186.140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.21973447501659393, + "epoch": 2.633578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04874730038388227, + "kl": 0.05952436476945877, + "learning_rate": 4.587250844206664e-08, + "loss": 0.0006, + "num_tokens": 67669461.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.80014169216156, + "sampling/importance_sampling_ratio/mean": 1.0001519918441772, + "sampling/importance_sampling_ratio/min": 0.5886259078979492, + "sampling/sampling_logp_difference/max": 0.5878653526306152, + "sampling/sampling_logp_difference/mean": 0.013445817865431309, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 178.0, + "completions/mean_terminated_length": 178.0, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.23014162480831146, + "epoch": 2.6348039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053334673910715065, + "kl": 0.08652716875076294, + "learning_rate": 4.557489777907836e-08, + "loss": 0.0008, + "num_tokens": 67696789.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5971620082855225, + "sampling/importance_sampling_ratio/mean": 0.999606728553772, + "sampling/importance_sampling_ratio/min": 0.6171416640281677, + "sampling/sampling_logp_difference/max": 0.48265671730041504, + "sampling/sampling_logp_difference/mean": 0.012588806450366974, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 270.203125, + "completions/mean_terminated_length": 270.203125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.1745699644088745, + "epoch": 2.6360294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05844377521284558, + "kl": 0.07397204637527466, + "learning_rate": 4.527820957025891e-08, + "loss": 0.0006, + "num_tokens": 67733314.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998288750648499, + "sampling/importance_sampling_ratio/min": 0.5096697211265564, + "sampling/sampling_logp_difference/max": 0.8388676643371582, + "sampling/sampling_logp_difference/mean": 0.010835763067007065, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 163.21875, + "completions/mean_terminated_length": 163.21875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2286285012960434, + "epoch": 2.6372549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3840725376123506, + "kl": 0.10957164317369461, + "learning_rate": 4.498244441786675e-08, + "loss": 0.004, + "num_tokens": 67759872.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6009145975112915, + "sampling/importance_sampling_ratio/mean": 0.9996766448020935, + "sampling/importance_sampling_ratio/min": 0.6485008001327515, + "sampling/sampling_logp_difference/max": 0.47057509422302246, + "sampling/sampling_logp_difference/mean": 0.013533113524317741, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 232.125, + "completions/mean_terminated_length": 232.125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.25470179319381714, + "epoch": 2.638480392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4633047084419286, + "kl": 0.11641483008861542, + "learning_rate": 4.4687602922286016e-08, + "loss": 0.0178, + "num_tokens": 67792104.0, + "reward": -0.1875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.9625647068023682, + "sampling/importance_sampling_ratio/mean": 1.0000993013381958, + "sampling/importance_sampling_ratio/min": 0.3522830605506897, + "sampling/sampling_logp_difference/max": 1.0433202981948853, + "sampling/sampling_logp_difference/mean": 0.015565533190965652, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 206.921875, + "completions/mean_terminated_length": 206.921875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.28980010747909546, + "epoch": 2.639705882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6208375507681307, + "kl": 0.11233049631118774, + "learning_rate": 4.4393685682026505e-08, + "loss": 0.003, + "num_tokens": 67827971.0, + "reward": 0.3125, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5966885089874268, + "sampling/importance_sampling_ratio/mean": 1.0005667209625244, + "sampling/importance_sampling_ratio/min": 0.5170525312423706, + "sampling/sampling_logp_difference/max": 0.6596107482910156, + "sampling/sampling_logp_difference/mean": 0.015001502819359303, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 182.0625, + "completions/mean_terminated_length": 182.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.29670900106430054, + "epoch": 2.6409313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050815895260017105, + "kl": 0.109046071767807, + "learning_rate": 4.4100693293721516e-08, + "loss": 0.0011, + "num_tokens": 67855511.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7177762985229492, + "sampling/importance_sampling_ratio/mean": 0.9998449087142944, + "sampling/importance_sampling_ratio/min": 0.41617515683174133, + "sampling/sampling_logp_difference/max": 0.876649022102356, + "sampling/sampling_logp_difference/mean": 0.016118215397000313, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 198.421875, + "completions/mean_terminated_length": 198.421875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.31891539692878723, + "epoch": 2.642156862745098, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.066544584098449, + "kl": 0.14812368154525757, + "learning_rate": 4.3808626352127066e-08, + "loss": 0.0771, + "num_tokens": 67887218.0, + "reward": 0.25, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.8718515634536743, + "sampling/importance_sampling_ratio/mean": 1.0007492303848267, + "sampling/importance_sampling_ratio/min": 0.2254074215888977, + "sampling/sampling_logp_difference/max": 1.4898457527160645, + "sampling/sampling_logp_difference/mean": 0.017643041908740997, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 189.484375, + "completions/mean_terminated_length": 189.484375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.24129286408424377, + "epoch": 2.6433823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2105055067435708, + "kl": 0.08592385798692703, + "learning_rate": 4.351748545012057e-08, + "loss": 0.0148, + "num_tokens": 67916353.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996558427810669, + "sampling/importance_sampling_ratio/min": 0.4141593277454376, + "sampling/sampling_logp_difference/max": 0.8815045356750488, + "sampling/sampling_logp_difference/mean": 0.01417774148285389, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 160.453125, + "completions/mean_terminated_length": 160.453125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.18149279057979584, + "epoch": 2.644607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039533652847857564, + "kl": 0.08074481785297394, + "learning_rate": 4.322727117869951e-08, + "loss": 0.0007, + "num_tokens": 67946542.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5744246244430542, + "sampling/importance_sampling_ratio/mean": 1.0003111362457275, + "sampling/importance_sampling_ratio/min": 0.4830726981163025, + "sampling/sampling_logp_difference/max": 0.7275881767272949, + "sampling/sampling_logp_difference/mean": 0.013689331710338593, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 171.734375, + "completions/mean_terminated_length": 171.734375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.22633984684944153, + "epoch": 2.6458333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12284008384429417, + "kl": 0.12330029159784317, + "learning_rate": 4.2937984126980686e-08, + "loss": 0.0011, + "num_tokens": 67972701.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4916808605194092, + "sampling/importance_sampling_ratio/mean": 1.0010933876037598, + "sampling/importance_sampling_ratio/min": 0.4786601662635803, + "sampling/sampling_logp_difference/max": 0.7367644309997559, + "sampling/sampling_logp_difference/mean": 0.012793360278010368, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 195.65625, + "completions/mean_terminated_length": 195.65625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3101569414138794, + "epoch": 2.6470588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.19164975738539, + "kl": 0.11972618848085403, + "learning_rate": 4.2649624882198196e-08, + "loss": -0.0011, + "num_tokens": 68005511.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999502301216125, + "sampling/importance_sampling_ratio/min": 0.5634000897407532, + "sampling/sampling_logp_difference/max": 0.7547166347503662, + "sampling/sampling_logp_difference/mean": 0.017234966158866882, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 150.84375, + "completions/mean_terminated_length": 150.84375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.290639191865921, + "epoch": 2.6482843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08913893913290491, + "kl": 0.16944536566734314, + "learning_rate": 4.2362194029703256e-08, + "loss": 0.0018, + "num_tokens": 68030029.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.967354655265808, + "sampling/importance_sampling_ratio/mean": 1.0003464221954346, + "sampling/importance_sampling_ratio/min": 0.42517244815826416, + "sampling/sampling_logp_difference/max": 0.8552604913711548, + "sampling/sampling_logp_difference/mean": 0.01711418852210045, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 184.828125, + "completions/mean_terminated_length": 184.828125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2637135982513428, + "epoch": 2.6495098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2258549695200438, + "kl": 0.1391540765762329, + "learning_rate": 4.207569215296214e-08, + "loss": 0.0077, + "num_tokens": 68059634.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4652819633483887, + "sampling/importance_sampling_ratio/mean": 0.9994097948074341, + "sampling/importance_sampling_ratio/min": 0.6319909691810608, + "sampling/sampling_logp_difference/max": 0.4588801860809326, + "sampling/sampling_logp_difference/mean": 0.014466384425759315, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 212.84375, + "completions/mean_terminated_length": 212.84375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.1459178626537323, + "epoch": 2.650735294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02709128132491764, + "kl": 0.027253184467554092, + "learning_rate": 4.179011983355568e-08, + "loss": 0.0003, + "num_tokens": 68098072.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7416507005691528, + "sampling/importance_sampling_ratio/mean": 1.0003209114074707, + "sampling/importance_sampling_ratio/min": 0.3821437656879425, + "sampling/sampling_logp_difference/max": 0.9619584083557129, + "sampling/sampling_logp_difference/mean": 0.010496910661458969, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 196.390625, + "completions/mean_terminated_length": 196.390625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.21812394261360168, + "epoch": 2.6519607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0931667172856299, + "kl": 0.07285928726196289, + "learning_rate": 4.150547765117746e-08, + "loss": -0.0063, + "num_tokens": 68126257.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5161488056182861, + "sampling/importance_sampling_ratio/mean": 0.9997991919517517, + "sampling/importance_sampling_ratio/min": 0.5910096764564514, + "sampling/sampling_logp_difference/max": 0.5259228944778442, + "sampling/sampling_logp_difference/mean": 0.012680470943450928, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 203.5, + "completions/mean_terminated_length": 203.5, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.21613028645515442, + "epoch": 2.653186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5038236539469443, + "kl": 0.07153953611850739, + "learning_rate": 4.1221766183633045e-08, + "loss": -0.009, + "num_tokens": 68162817.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4761751890182495, + "sampling/importance_sampling_ratio/mean": 1.0000369548797607, + "sampling/importance_sampling_ratio/min": 0.4108840227127075, + "sampling/sampling_logp_difference/max": 0.8894443511962891, + "sampling/sampling_logp_difference/mean": 0.013531733304262161, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 143.65625, + "completions/mean_terminated_length": 143.65625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.16796152293682098, + "epoch": 2.6544117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056318582333555633, + "kl": 0.05783125013113022, + "learning_rate": 4.0938986006838926e-08, + "loss": 0.0006, + "num_tokens": 68187451.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6359341144561768, + "sampling/importance_sampling_ratio/mean": 0.9998639822006226, + "sampling/importance_sampling_ratio/min": 0.6097320318222046, + "sampling/sampling_logp_difference/max": 0.4947357177734375, + "sampling/sampling_logp_difference/mean": 0.010606948286294937, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2929326891899109, + "epoch": 2.655637254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2168245171692678, + "kl": 0.14285142719745636, + "learning_rate": 4.065713769482082e-08, + "loss": 0.0052, + "num_tokens": 68218891.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998737573623657, + "sampling/importance_sampling_ratio/min": 0.46241626143455505, + "sampling/sampling_logp_difference/max": 0.7988262176513672, + "sampling/sampling_logp_difference/mean": 0.016140636056661606, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 155.515625, + "completions/mean_terminated_length": 155.515625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2612577974796295, + "epoch": 2.656862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.547020456094274, + "kl": 0.09141572564840317, + "learning_rate": 4.037622181971295e-08, + "loss": 0.0108, + "num_tokens": 68245932.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008686780929565, + "sampling/importance_sampling_ratio/min": 0.5016920566558838, + "sampling/sampling_logp_difference/max": 0.9166195392608643, + "sampling/sampling_logp_difference/mean": 0.015860222280025482, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 187.09375, + "completions/mean_terminated_length": 187.09375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.30533725023269653, + "epoch": 2.6580882352941178, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1363831956584387, + "kl": 0.15566954016685486, + "learning_rate": 4.009623895175662e-08, + "loss": 0.032, + "num_tokens": 68275042.0, + "reward": 0.375, + "reward_std": 0.6789814233779907, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 1.0000407695770264, + "sampling/importance_sampling_ratio/min": 0.4782702624797821, + "sampling/sampling_logp_difference/max": 0.737579345703125, + "sampling/sampling_logp_difference/mean": 0.015945661813020706, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 191.84375, + "completions/mean_terminated_length": 191.84375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.22283464670181274, + "epoch": 2.659313725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5947561048545837, + "kl": 0.07634736597537994, + "learning_rate": 3.981718965929959e-08, + "loss": 0.023, + "num_tokens": 68309816.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.894351601600647, + "sampling/importance_sampling_ratio/mean": 0.9995042681694031, + "sampling/importance_sampling_ratio/min": 0.6615972518920898, + "sampling/sampling_logp_difference/max": 0.6388766765594482, + "sampling/sampling_logp_difference/mean": 0.01337276678532362, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 173.484375, + "completions/mean_terminated_length": 173.484375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.18940766155719757, + "epoch": 2.6605392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0193577112878476, + "kl": 0.0752667635679245, + "learning_rate": 3.953907450879407e-08, + "loss": 0.0264, + "num_tokens": 68335895.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.8179810047149658, + "sampling/importance_sampling_ratio/mean": 1.000649094581604, + "sampling/importance_sampling_ratio/min": 0.41839301586151123, + "sampling/sampling_logp_difference/max": 0.8713340759277344, + "sampling/sampling_logp_difference/mean": 0.012507520616054535, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 143.71875, + "completions/mean_terminated_length": 143.71875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.21278055012226105, + "epoch": 2.661764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06850182894093394, + "kl": 0.05794823169708252, + "learning_rate": 3.926189406479613e-08, + "loss": 0.0006, + "num_tokens": 68368245.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9695031642913818, + "sampling/importance_sampling_ratio/mean": 1.0009080171585083, + "sampling/importance_sampling_ratio/min": 0.44676312804222107, + "sampling/sampling_logp_difference/max": 0.8057267665863037, + "sampling/sampling_logp_difference/mean": 0.015010137110948563, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 182.453125, + "completions/mean_terminated_length": 182.453125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2997394800186157, + "epoch": 2.6629901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.876729619808282, + "kl": 0.1063377782702446, + "learning_rate": 3.898564888996475e-08, + "loss": 0.0048, + "num_tokens": 68397090.0, + "reward": 0.15625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.590027093887329, + "sampling/importance_sampling_ratio/mean": 1.0003613233566284, + "sampling/importance_sampling_ratio/min": 0.5680696368217468, + "sampling/sampling_logp_difference/max": 0.5655112266540527, + "sampling/sampling_logp_difference/mean": 0.016017910093069077, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 250.25, + "completions/mean_terminated_length": 250.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.24907062947750092, + "epoch": 2.6642156862745097, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.284869883192685, + "kl": 0.12940922379493713, + "learning_rate": 3.871033954505998e-08, + "loss": 0.0139, + "num_tokens": 68427010.0, + "reward": 0.375, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7548948526382446, + "sampling/importance_sampling_ratio/mean": 0.9992954730987549, + "sampling/importance_sampling_ratio/min": 0.5566850900650024, + "sampling/sampling_logp_difference/max": 0.5857555270195007, + "sampling/sampling_logp_difference/mean": 0.013966716825962067, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 171.109375, + "completions/mean_terminated_length": 171.109375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2446766197681427, + "epoch": 2.6654411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1051873344208007, + "kl": 0.08149237930774689, + "learning_rate": 3.843596658894232e-08, + "loss": -0.019, + "num_tokens": 68456169.0, + "reward": 0.5, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9990978837013245, + "sampling/importance_sampling_ratio/min": 0.5058292150497437, + "sampling/sampling_logp_difference/max": 0.729045033454895, + "sampling/sampling_logp_difference/mean": 0.014238040894269943, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.2134552299976349, + "epoch": 2.6666666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07743073911001033, + "kl": 0.1175810694694519, + "learning_rate": 3.816253057857144e-08, + "loss": 0.0011, + "num_tokens": 68482337.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000868558883667, + "sampling/importance_sampling_ratio/min": 0.5550065040588379, + "sampling/sampling_logp_difference/max": 0.709916353225708, + "sampling/sampling_logp_difference/mean": 0.014420795254409313, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 174.546875, + "completions/mean_terminated_length": 174.546875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.23113234341144562, + "epoch": 2.667892156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056168849697236796, + "kl": 0.0727619156241417, + "learning_rate": 3.789003206900537e-08, + "loss": 0.0007, + "num_tokens": 68515156.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5723931789398193, + "sampling/importance_sampling_ratio/mean": 1.0006955862045288, + "sampling/importance_sampling_ratio/min": 0.3827452063560486, + "sampling/sampling_logp_difference/max": 0.960385799407959, + "sampling/sampling_logp_difference/mean": 0.015615207143127918, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 202.28125, + "completions/mean_terminated_length": 202.28125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2749329209327698, + "epoch": 2.6691176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6762929868067675, + "kl": 0.09398925304412842, + "learning_rate": 3.7618471613398597e-08, + "loss": -0.0335, + "num_tokens": 68552822.0, + "reward": 0.4375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997743368148804, + "sampling/importance_sampling_ratio/min": 0.29177358746528625, + "sampling/sampling_logp_difference/max": 1.2317771911621094, + "sampling/sampling_logp_difference/mean": 0.01894037052989006, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 202.796875, + "completions/mean_terminated_length": 202.796875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3282843828201294, + "epoch": 2.670343137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6705478242006395, + "kl": 0.13938282430171967, + "learning_rate": 3.734784976300165e-08, + "loss": -0.018, + "num_tokens": 68587961.0, + "reward": 0.1875, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6016663312911987, + "sampling/importance_sampling_ratio/mean": 0.999578058719635, + "sampling/importance_sampling_ratio/min": 0.4243606925010681, + "sampling/sampling_logp_difference/max": 0.8571715354919434, + "sampling/sampling_logp_difference/mean": 0.017624136060476303, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 145.421875, + "completions/mean_terminated_length": 145.421875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.20929139852523804, + "epoch": 2.6715686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6164602959815932, + "kl": 0.10157999396324158, + "learning_rate": 3.7078167067159826e-08, + "loss": -0.0035, + "num_tokens": 68612260.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.7619878053665161, + "sampling/importance_sampling_ratio/mean": 1.0003855228424072, + "sampling/importance_sampling_ratio/min": 0.6008952856063843, + "sampling/sampling_logp_difference/max": 0.566442608833313, + "sampling/sampling_logp_difference/mean": 0.012949692085385323, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 183.5, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.25185340642929077, + "epoch": 2.672794117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1774559193845284, + "kl": 0.08610805869102478, + "learning_rate": 3.6809424073311944e-08, + "loss": 0.0065, + "num_tokens": 68643444.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.855698823928833, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.3847578763961792, + "sampling/sampling_logp_difference/max": 0.9551410675048828, + "sampling/sampling_logp_difference/mean": 0.018331876024603844, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 181.625, + "completions/mean_terminated_length": 181.625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.22100377082824707, + "epoch": 2.674019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6092179363665533, + "kl": 0.09422071278095245, + "learning_rate": 3.654162132698918e-08, + "loss": 0.0541, + "num_tokens": 68670236.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6084016561508179, + "sampling/importance_sampling_ratio/mean": 1.0002474784851074, + "sampling/importance_sampling_ratio/min": 0.49823063611984253, + "sampling/sampling_logp_difference/max": 0.6966922283172607, + "sampling/sampling_logp_difference/mean": 0.012405113317072392, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 216.40625, + "completions/mean_terminated_length": 216.40625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.24076037108898163, + "epoch": 2.6752450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0609488661572175, + "kl": 0.12102796882390976, + "learning_rate": 3.627475937181407e-08, + "loss": 0.009, + "num_tokens": 68705414.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6280242204666138, + "sampling/importance_sampling_ratio/mean": 1.0000336170196533, + "sampling/importance_sampling_ratio/min": 0.5584572553634644, + "sampling/sampling_logp_difference/max": 0.5825772285461426, + "sampling/sampling_logp_difference/mean": 0.01332184486091137, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 172.8125, + "completions/mean_terminated_length": 172.8125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.24437132477760315, + "epoch": 2.6764705882352944, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.127945876833359, + "kl": 0.09145861119031906, + "learning_rate": 3.600883874949967e-08, + "loss": 0.0149, + "num_tokens": 68732266.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5333822965621948, + "sampling/importance_sampling_ratio/mean": 1.0007474422454834, + "sampling/importance_sampling_ratio/min": 0.5185667276382446, + "sampling/sampling_logp_difference/max": 0.656686544418335, + "sampling/sampling_logp_difference/mean": 0.014701835811138153, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 206.671875, + "completions/mean_terminated_length": 206.671875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2812899649143219, + "epoch": 2.6776960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2944153670239382, + "kl": 0.0997249186038971, + "learning_rate": 3.574385999984786e-08, + "loss": 0.0004, + "num_tokens": 68763621.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6091861724853516, + "sampling/importance_sampling_ratio/mean": 1.000199556350708, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.4826626777648926, + "sampling/sampling_logp_difference/mean": 0.015883859246969223, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.18499572575092316, + "epoch": 2.678921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.462843051683198, + "kl": 0.06560080498456955, + "learning_rate": 3.54798236607487e-08, + "loss": -0.0288, + "num_tokens": 68790355.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5314031839370728, + "sampling/importance_sampling_ratio/mean": 0.9995941519737244, + "sampling/importance_sampling_ratio/min": 0.2466481328010559, + "sampling/sampling_logp_difference/max": 1.3997925519943237, + "sampling/sampling_logp_difference/mean": 0.012036822736263275, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 182.6875, + "completions/mean_terminated_length": 182.6875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2512951195240021, + "epoch": 2.6801470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4222979639673834, + "kl": 0.07893835008144379, + "learning_rate": 3.5216730268179337e-08, + "loss": -0.0234, + "num_tokens": 68824447.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6204001903533936, + "sampling/importance_sampling_ratio/mean": 1.0002648830413818, + "sampling/importance_sampling_ratio/min": 0.6091734766960144, + "sampling/sampling_logp_difference/max": 0.4956521987915039, + "sampling/sampling_logp_difference/mean": 0.014549204148352146, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 165.203125, + "completions/mean_terminated_length": 165.203125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.22966158390045166, + "epoch": 2.681372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3408040168452255, + "kl": 0.07859854400157928, + "learning_rate": 3.495458035620252e-08, + "loss": 0.0078, + "num_tokens": 68851980.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8719398975372314, + "sampling/importance_sampling_ratio/mean": 0.9992271661758423, + "sampling/importance_sampling_ratio/min": 0.5702707171440125, + "sampling/sampling_logp_difference/max": 0.6269752979278564, + "sampling/sampling_logp_difference/mean": 0.014652803540229797, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 223.046875, + "completions/mean_terminated_length": 223.046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.29275381565093994, + "epoch": 2.6825980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.282237347488146, + "kl": 0.12887243926525116, + "learning_rate": 3.469337445696629e-08, + "loss": -0.1028, + "num_tokens": 68883711.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.7657581567764282, + "sampling/importance_sampling_ratio/mean": 0.9999592304229736, + "sampling/importance_sampling_ratio/min": 0.47417914867401123, + "sampling/sampling_logp_difference/max": 0.7461700439453125, + "sampling/sampling_logp_difference/mean": 0.01636071503162384, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 174.046875, + "completions/mean_terminated_length": 174.046875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2956045866012573, + "epoch": 2.6838235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.439245239378223, + "kl": 0.11607477068901062, + "learning_rate": 3.4433113100701683e-08, + "loss": -0.0151, + "num_tokens": 68911890.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007710456848145, + "sampling/importance_sampling_ratio/min": 0.5396926403045654, + "sampling/sampling_logp_difference/max": 0.776573657989502, + "sampling/sampling_logp_difference/mean": 0.016570597887039185, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.22272804379463196, + "epoch": 2.685049019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3186346124254964, + "kl": 0.06915993988513947, + "learning_rate": 3.417379681572296e-08, + "loss": 0.0034, + "num_tokens": 68941978.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6963083744049072, + "sampling/importance_sampling_ratio/mean": 0.9999330639839172, + "sampling/importance_sampling_ratio/min": 0.42088425159454346, + "sampling/sampling_logp_difference/max": 0.8653974533081055, + "sampling/sampling_logp_difference/mean": 0.014592324383556843, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 168.34375, + "completions/mean_terminated_length": 168.34375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2185148000717163, + "epoch": 2.686274509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.230048376251708, + "kl": 0.08221367746591568, + "learning_rate": 3.391542612842574e-08, + "loss": -0.0049, + "num_tokens": 68972128.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7236268520355225, + "sampling/importance_sampling_ratio/mean": 1.000931978225708, + "sampling/importance_sampling_ratio/min": 0.5048971176147461, + "sampling/sampling_logp_difference/max": 0.6834006309509277, + "sampling/sampling_logp_difference/mean": 0.014054270461201668, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 209.46875, + "completions/mean_terminated_length": 209.46875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.22842177748680115, + "epoch": 2.6875, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6492742690847177, + "kl": 0.09010639786720276, + "learning_rate": 3.365800156328619e-08, + "loss": -0.0249, + "num_tokens": 69004974.0, + "reward": 0.0625, + "reward_std": 0.5123475193977356, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7725313901901245, + "sampling/importance_sampling_ratio/mean": 0.9995955228805542, + "sampling/importance_sampling_ratio/min": 0.6161969900131226, + "sampling/sampling_logp_difference/max": 0.5724086761474609, + "sampling/sampling_logp_difference/mean": 0.012595223262906075, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 154.90625, + "completions/mean_terminated_length": 154.90625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.25736111402511597, + "epoch": 2.688725490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05073242573608991, + "kl": 0.07285177707672119, + "learning_rate": 3.3401523642859805e-08, + "loss": 0.0007, + "num_tokens": 69036392.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5882704257965088, + "sampling/importance_sampling_ratio/mean": 0.9993676543235779, + "sampling/importance_sampling_ratio/min": 0.41003796458244324, + "sampling/sampling_logp_difference/max": 0.8915054798126221, + "sampling/sampling_logp_difference/mean": 0.017075642943382263, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 147.296875, + "completions/mean_terminated_length": 147.296875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.21071837842464447, + "epoch": 2.689950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3758237494845689, + "kl": 0.06788364052772522, + "learning_rate": 3.3145992887780475e-08, + "loss": -0.0024, + "num_tokens": 69062539.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5085123777389526, + "sampling/importance_sampling_ratio/mean": 0.9999905228614807, + "sampling/importance_sampling_ratio/min": 0.49252715706825256, + "sampling/sampling_logp_difference/max": 0.7082056999206543, + "sampling/sampling_logp_difference/mean": 0.01327443402260542, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 205.078125, + "completions/mean_terminated_length": 205.078125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.23580729961395264, + "epoch": 2.6911764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.203832190676799, + "kl": 0.07924351096153259, + "learning_rate": 3.289140981675964e-08, + "loss": -0.0017, + "num_tokens": 69093408.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6272674798965454, + "sampling/importance_sampling_ratio/mean": 1.0008111000061035, + "sampling/importance_sampling_ratio/min": 0.6301074028015137, + "sampling/sampling_logp_difference/max": 0.48690223693847656, + "sampling/sampling_logp_difference/mean": 0.013151616789400578, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 245.359375, + "completions/mean_terminated_length": 245.359375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.2598392367362976, + "epoch": 2.6924019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045813206867263846, + "kl": 0.07030776143074036, + "learning_rate": 3.263777494658448e-08, + "loss": 0.0007, + "num_tokens": 69130663.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5993943214416504, + "sampling/importance_sampling_ratio/mean": 1.0002115964889526, + "sampling/importance_sampling_ratio/min": 0.535071074962616, + "sampling/sampling_logp_difference/max": 0.6253557205200195, + "sampling/sampling_logp_difference/mean": 0.01403868943452835, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 205.25, + "completions/mean_terminated_length": 205.25, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.20189495384693146, + "epoch": 2.693627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1070948841503783, + "kl": 0.1220339834690094, + "learning_rate": 3.2385088792118044e-08, + "loss": 0.0019, + "num_tokens": 69159847.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6558984518051147, + "sampling/importance_sampling_ratio/mean": 1.0001466274261475, + "sampling/importance_sampling_ratio/min": 0.6061429977416992, + "sampling/sampling_logp_difference/max": 0.5043437480926514, + "sampling/sampling_logp_difference/mean": 0.01365315355360508, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 185.25, + "completions/mean_terminated_length": 185.25, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.28888899087905884, + "epoch": 2.6948529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07784601669417482, + "kl": 0.1149665042757988, + "learning_rate": 3.2133351866296955e-08, + "loss": 0.0013, + "num_tokens": 69191559.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006163120269775, + "sampling/importance_sampling_ratio/min": 0.4392172694206238, + "sampling/sampling_logp_difference/max": 0.822761058807373, + "sampling/sampling_logp_difference/mean": 0.016775013878941536, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 204.65625, + "completions/mean_terminated_length": 204.65625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.22148284316062927, + "epoch": 2.696078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9038119411824908, + "kl": 0.05128861218690872, + "learning_rate": 3.188256468013139e-08, + "loss": 0.0162, + "num_tokens": 69223041.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5878969430923462, + "sampling/importance_sampling_ratio/mean": 0.9997825622558594, + "sampling/importance_sampling_ratio/min": 0.553889274597168, + "sampling/sampling_logp_difference/max": 0.5907905101776123, + "sampling/sampling_logp_difference/mean": 0.01325621921569109, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 159.0625, + "completions/mean_terminated_length": 159.0625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.28267282247543335, + "epoch": 2.6973039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8340136244817302, + "kl": 0.17508764564990997, + "learning_rate": 3.163272774270348e-08, + "loss": -0.0049, + "num_tokens": 69247621.0, + "reward": 0.3125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6154650449752808, + "sampling/importance_sampling_ratio/mean": 0.999939501285553, + "sampling/importance_sampling_ratio/min": 0.5038948059082031, + "sampling/sampling_logp_difference/max": 0.6853878498077393, + "sampling/sampling_logp_difference/mean": 0.015061620622873306, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 145.640625, + "completions/mean_terminated_length": 145.640625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2137831449508667, + "epoch": 2.6985294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6664174079426848, + "kl": 0.10704906284809113, + "learning_rate": 3.1383841561166134e-08, + "loss": 0.0061, + "num_tokens": 69269486.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.456510066986084, + "sampling/importance_sampling_ratio/mean": 0.999748945236206, + "sampling/importance_sampling_ratio/min": 0.4057944118976593, + "sampling/sampling_logp_difference/max": 0.9019086360931396, + "sampling/sampling_logp_difference/mean": 0.013272078707814217, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 208.234375, + "completions/mean_terminated_length": 208.234375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.2738074064254761, + "epoch": 2.6997549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7830345545165356, + "kl": 0.12862104177474976, + "learning_rate": 3.1135906640742836e-08, + "loss": 0.0934, + "num_tokens": 69301181.0, + "reward": 0.625, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6089228391647339, + "sampling/importance_sampling_ratio/mean": 1.0008376836776733, + "sampling/importance_sampling_ratio/min": 0.35846590995788574, + "sampling/sampling_logp_difference/max": 1.0259218215942383, + "sampling/sampling_logp_difference/mean": 0.01580899953842163, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 189.5625, + "completions/mean_terminated_length": 189.5625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.24207505583763123, + "epoch": 2.700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.707672381286519, + "kl": 0.07233038544654846, + "learning_rate": 3.088892348472561e-08, + "loss": 0.0326, + "num_tokens": 69331793.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5470654964447021, + "sampling/importance_sampling_ratio/mean": 1.0001225471496582, + "sampling/importance_sampling_ratio/min": 0.3600119948387146, + "sampling/sampling_logp_difference/max": 1.0216180086135864, + "sampling/sampling_logp_difference/mean": 0.014072047546505928, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 183.578125, + "completions/mean_terminated_length": 183.578125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2872757911682129, + "epoch": 2.702205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2622679481807824, + "kl": 0.13634923100471497, + "learning_rate": 3.064289259447455e-08, + "loss": 0.0036, + "num_tokens": 69357414.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5744361877441406, + "sampling/importance_sampling_ratio/mean": 1.000053882598877, + "sampling/importance_sampling_ratio/min": 0.5513951778411865, + "sampling/sampling_logp_difference/max": 0.5953035354614258, + "sampling/sampling_logp_difference/mean": 0.01701120287179947, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 208.125, + "completions/mean_terminated_length": 208.125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3441547751426697, + "epoch": 2.7034313725490198, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.069953362663289, + "kl": 0.13246379792690277, + "learning_rate": 3.039781446941697e-08, + "loss": -0.0108, + "num_tokens": 69390014.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.86025071144104, + "sampling/importance_sampling_ratio/mean": 1.0000040531158447, + "sampling/importance_sampling_ratio/min": 0.6281057000160217, + "sampling/sampling_logp_difference/max": 0.6207113265991211, + "sampling/sampling_logp_difference/mean": 0.016756637021899223, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 141.234375, + "completions/mean_terminated_length": 141.234375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2297401875257492, + "epoch": 2.704656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13295957964192232, + "kl": 0.08374090492725372, + "learning_rate": 3.015368960704584e-08, + "loss": 0.0009, + "num_tokens": 69416845.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001006126403809, + "sampling/importance_sampling_ratio/min": 0.4029476046562195, + "sampling/sampling_logp_difference/max": 0.9089487791061401, + "sampling/sampling_logp_difference/mean": 0.016758276149630547, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 200.59375, + "completions/mean_terminated_length": 200.59375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.27449631690979004, + "epoch": 2.7058823529411766, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.303810297427504, + "kl": 0.11424297839403152, + "learning_rate": 2.991051850291915e-08, + "loss": 0.0132, + "num_tokens": 69444211.0, + "reward": 0.40625, + "reward_std": 0.5431214570999146, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.8529949188232422, + "sampling/importance_sampling_ratio/mean": 1.00038480758667, + "sampling/importance_sampling_ratio/min": 0.3628750145435333, + "sampling/sampling_logp_difference/max": 1.0136967897415161, + "sampling/sampling_logp_difference/mean": 0.015266727656126022, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 181.15625, + "completions/mean_terminated_length": 181.15625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.23630809783935547, + "epoch": 2.707107843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2042021830786684, + "kl": 0.10635527968406677, + "learning_rate": 2.9668301650658756e-08, + "loss": -0.0039, + "num_tokens": 69476605.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997677803039551, + "sampling/importance_sampling_ratio/min": 0.4353478252887726, + "sampling/sampling_logp_difference/max": 0.8316099643707275, + "sampling/sampling_logp_difference/mean": 0.014236892573535442, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 221.9375, + "completions/mean_terminated_length": 221.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.20373618602752686, + "epoch": 2.7083333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8347800298068822, + "kl": 0.05157628282904625, + "learning_rate": 2.9427039541949638e-08, + "loss": -0.0672, + "num_tokens": 69507161.0, + "reward": 0.125, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 0.6146666407585144, + "sampling/sampling_logp_difference/max": 0.48689985275268555, + "sampling/sampling_logp_difference/mean": 0.010913224890828133, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 159.203125, + "completions/mean_terminated_length": 159.203125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2779569625854492, + "epoch": 2.7095588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.08142156977234, + "kl": 0.08717378973960876, + "learning_rate": 2.918673266653865e-08, + "loss": -0.0266, + "num_tokens": 69535110.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6978199481964111, + "sampling/importance_sampling_ratio/mean": 0.9993019104003906, + "sampling/importance_sampling_ratio/min": 0.49558350443840027, + "sampling/sampling_logp_difference/max": 0.702019453048706, + "sampling/sampling_logp_difference/mean": 0.016375428065657616, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 151.96875, + "completions/mean_terminated_length": 151.96875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.20503515005111694, + "epoch": 2.7107843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4751383794409036, + "kl": 0.07435000687837601, + "learning_rate": 2.8947381512233305e-08, + "loss": 0.0156, + "num_tokens": 69562356.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9469518661499023, + "sampling/importance_sampling_ratio/mean": 1.0001492500305176, + "sampling/importance_sampling_ratio/min": 0.48148271441459656, + "sampling/sampling_logp_difference/max": 0.7308850288391113, + "sampling/sampling_logp_difference/mean": 0.014628005214035511, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 211.53125, + "completions/mean_terminated_length": 211.53125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.23537412285804749, + "epoch": 2.7120098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3870547483022524, + "kl": 0.07981985807418823, + "learning_rate": 2.8708986564901504e-08, + "loss": -0.0097, + "num_tokens": 69594150.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6990214586257935, + "sampling/importance_sampling_ratio/mean": 1.000446081161499, + "sampling/importance_sampling_ratio/min": 0.4834754765033722, + "sampling/sampling_logp_difference/max": 0.7267546653747559, + "sampling/sampling_logp_difference/mean": 0.015160782262682915, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 181.890625, + "completions/mean_terminated_length": 181.890625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.17048130929470062, + "epoch": 2.713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04539440661198622, + "kl": 0.04399728402495384, + "learning_rate": 2.8471548308469706e-08, + "loss": 0.0004, + "num_tokens": 69617903.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6598318815231323, + "sampling/importance_sampling_ratio/mean": 1.0003302097320557, + "sampling/importance_sampling_ratio/min": 0.536295473575592, + "sampling/sampling_logp_difference/max": 0.6230700016021729, + "sampling/sampling_logp_difference/mean": 0.012363039888441563, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 180.59375, + "completions/mean_terminated_length": 180.59375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.26669445633888245, + "epoch": 2.7144607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5988910701713328, + "kl": 0.13101765513420105, + "learning_rate": 2.8235067224922802e-08, + "loss": 0.0155, + "num_tokens": 69644197.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.619104266166687, + "sampling/importance_sampling_ratio/mean": 0.9997296929359436, + "sampling/importance_sampling_ratio/min": 0.5253884196281433, + "sampling/sampling_logp_difference/max": 0.6436173915863037, + "sampling/sampling_logp_difference/mean": 0.015141832642257214, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 231.1875, + "completions/mean_terminated_length": 231.1875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.23241564631462097, + "epoch": 2.715686274509804, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8356100270459297, + "kl": 0.08637715876102448, + "learning_rate": 2.799954379430208e-08, + "loss": -0.0297, + "num_tokens": 69680689.0, + "reward": 0.71875, + "reward_std": 0.5809217691421509, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.697717308998108, + "sampling/importance_sampling_ratio/mean": 1.000471591949463, + "sampling/importance_sampling_ratio/min": 0.4300449788570404, + "sampling/sampling_logp_difference/max": 0.8438655138015747, + "sampling/sampling_logp_difference/mean": 0.014142333529889584, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 185.5625, + "completions/mean_terminated_length": 185.5625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2993847131729126, + "epoch": 2.7169117647058822, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.152223952848721, + "kl": 0.15374311804771423, + "learning_rate": 2.7764978494705437e-08, + "loss": -0.0387, + "num_tokens": 69712949.0, + "reward": 0.28125, + "reward_std": 0.7643726468086243, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6132571697235107, + "sampling/importance_sampling_ratio/mean": 0.9998944997787476, + "sampling/importance_sampling_ratio/min": 0.45971566438674927, + "sampling/sampling_logp_difference/max": 0.7771470546722412, + "sampling/sampling_logp_difference/mean": 0.015908479690551758, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 177.203125, + "completions/mean_terminated_length": 177.203125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.16632476449012756, + "epoch": 2.718137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3446729340689112, + "kl": 0.05673764646053314, + "learning_rate": 2.753137180228543e-08, + "loss": -0.0011, + "num_tokens": 69737010.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8623982667922974, + "sampling/importance_sampling_ratio/mean": 0.9996699094772339, + "sampling/importance_sampling_ratio/min": 0.47910451889038086, + "sampling/sampling_logp_difference/max": 0.7358365058898926, + "sampling/sampling_logp_difference/mean": 0.01236045453697443, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 225.03125, + "completions/mean_terminated_length": 225.03125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3208922743797302, + "epoch": 2.719362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8880314209417628, + "kl": 0.09001030772924423, + "learning_rate": 2.729872419124879e-08, + "loss": -0.0076, + "num_tokens": 69768868.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.513028860092163, + "sampling/importance_sampling_ratio/mean": 1.0003256797790527, + "sampling/importance_sampling_ratio/min": 0.47280260920524597, + "sampling/sampling_logp_difference/max": 0.749077320098877, + "sampling/sampling_logp_difference/mean": 0.016022734344005585, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 156.109375, + "completions/mean_terminated_length": 156.109375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.21761839091777802, + "epoch": 2.7205882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.760803295294139, + "kl": 0.10163723677396774, + "learning_rate": 2.7067036133855636e-08, + "loss": -0.0109, + "num_tokens": 69797995.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6356077194213867, + "sampling/importance_sampling_ratio/mean": 1.0007399320602417, + "sampling/importance_sampling_ratio/min": 0.5676478147506714, + "sampling/sampling_logp_difference/max": 0.5662540197372437, + "sampling/sampling_logp_difference/mean": 0.014694325625896454, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 142.234375, + "completions/mean_terminated_length": 142.234375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2053075134754181, + "epoch": 2.721813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0467044526354985, + "kl": 0.07528539001941681, + "learning_rate": 2.6836308100417872e-08, + "loss": 0.0008, + "num_tokens": 69825258.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996945261955261, + "sampling/importance_sampling_ratio/min": 0.613776445388794, + "sampling/sampling_logp_difference/max": 0.955998420715332, + "sampling/sampling_logp_difference/mean": 0.012028173543512821, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 162.890625, + "completions/mean_terminated_length": 162.890625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2621799111366272, + "epoch": 2.7230392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2452878397876777, + "kl": 0.11138811707496643, + "learning_rate": 2.6606540559298952e-08, + "loss": 0.0125, + "num_tokens": 69854243.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.627645492553711, + "sampling/importance_sampling_ratio/mean": 1.0003089904785156, + "sampling/importance_sampling_ratio/min": 0.621735692024231, + "sampling/sampling_logp_difference/max": 0.4871344566345215, + "sampling/sampling_logp_difference/mean": 0.013921591453254223, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 204.71875, + "completions/mean_terminated_length": 204.71875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.18779292702674866, + "epoch": 2.724264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039374648181891785, + "kl": 0.04470675811171532, + "learning_rate": 2.6377733976912232e-08, + "loss": 0.0004, + "num_tokens": 69881889.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7185267210006714, + "sampling/importance_sampling_ratio/mean": 1.000443696975708, + "sampling/importance_sampling_ratio/min": 0.48124879598617554, + "sampling/sampling_logp_difference/max": 0.7313709259033203, + "sampling/sampling_logp_difference/mean": 0.013233638368546963, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 191.03125, + "completions/mean_terminated_length": 191.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.24338199198246002, + "epoch": 2.7254901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07125307899557784, + "kl": 0.08830739557743073, + "learning_rate": 2.6149888817720733e-08, + "loss": 0.001, + "num_tokens": 69913427.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8971585035324097, + "sampling/importance_sampling_ratio/mean": 1.0001130104064941, + "sampling/importance_sampling_ratio/min": 0.4601728022098541, + "sampling/sampling_logp_difference/max": 0.7761532068252563, + "sampling/sampling_logp_difference/mean": 0.014650973491370678, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 196.484375, + "completions/mean_terminated_length": 196.484375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.23095597326755524, + "epoch": 2.7267156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04004652586011617, + "kl": 0.04922451078891754, + "learning_rate": 2.5923005544235545e-08, + "loss": 0.0005, + "num_tokens": 69949442.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6214406490325928, + "sampling/importance_sampling_ratio/mean": 1.0002517700195312, + "sampling/importance_sampling_ratio/min": 0.4137243926525116, + "sampling/sampling_logp_difference/max": 0.8825552463531494, + "sampling/sampling_logp_difference/mean": 0.013647917658090591, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 236.5, + "completions/mean_terminated_length": 236.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.27529871463775635, + "epoch": 2.7279411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19390980695616739, + "kl": 0.07142721861600876, + "learning_rate": 2.5697084617015475e-08, + "loss": 0.0007, + "num_tokens": 69987986.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.674102783203125, + "sampling/importance_sampling_ratio/mean": 0.9996586441993713, + "sampling/importance_sampling_ratio/min": 0.3969041407108307, + "sampling/sampling_logp_difference/max": 0.9240604639053345, + "sampling/sampling_logp_difference/mean": 0.015878435224294662, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 224.015625, + "completions/mean_terminated_length": 224.015625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.21713590621948242, + "epoch": 2.7291666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06094074724350776, + "kl": 0.05968696251511574, + "learning_rate": 2.547212649466568e-08, + "loss": 0.0006, + "num_tokens": 70024403.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998601675033569, + "sampling/importance_sampling_ratio/min": 0.08962565660476685, + "sampling/sampling_logp_difference/max": 2.412113666534424, + "sampling/sampling_logp_difference/mean": 0.014678400941193104, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 155.484375, + "completions/mean_terminated_length": 155.484375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.27618515491485596, + "epoch": 2.730392156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.98056953965246, + "kl": 0.15442435443401337, + "learning_rate": 2.5248131633836823e-08, + "loss": 0.0119, + "num_tokens": 70057922.0, + "reward": -0.375, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6089198589324951, + "sampling/importance_sampling_ratio/mean": 0.9995495080947876, + "sampling/importance_sampling_ratio/min": 0.5513139963150024, + "sampling/sampling_logp_difference/max": 0.595450758934021, + "sampling/sampling_logp_difference/mean": 0.017073802649974823, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.30946943163871765, + "epoch": 2.7316176470588234, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9276934903335479, + "kl": 0.12449204176664352, + "learning_rate": 2.5025100489224406e-08, + "loss": 0.002, + "num_tokens": 70089252.0, + "reward": 0.875, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002161264419556, + "sampling/importance_sampling_ratio/min": 0.5910323262214661, + "sampling/sampling_logp_difference/max": 0.7526144981384277, + "sampling/sampling_logp_difference/mean": 0.01713874749839306, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 236.46875, + "completions/mean_terminated_length": 236.46875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.29705095291137695, + "epoch": 2.732843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.14257301574741, + "kl": 0.17638657987117767, + "learning_rate": 2.480303351356733e-08, + "loss": -0.001, + "num_tokens": 70125650.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995177984237671, + "sampling/importance_sampling_ratio/min": 0.006017514504492283, + "sampling/sampling_logp_difference/max": 5.113080978393555, + "sampling/sampling_logp_difference/mean": 0.016602501273155212, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 154.0, + "completions/mean_terminated_length": 154.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.24172338843345642, + "epoch": 2.7340686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3895605040650352, + "kl": 0.10444362461566925, + "learning_rate": 2.4581931157647674e-08, + "loss": 0.0014, + "num_tokens": 70152914.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4717568159103394, + "sampling/importance_sampling_ratio/mean": 0.9994391202926636, + "sampling/importance_sampling_ratio/min": 0.5495074391365051, + "sampling/sampling_logp_difference/max": 0.5987329483032227, + "sampling/sampling_logp_difference/mean": 0.014560364186763763, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 173.40625, + "completions/mean_terminated_length": 173.40625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.19719503819942474, + "epoch": 2.735294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05655211858590894, + "kl": 0.053818728774785995, + "learning_rate": 2.4361793870289028e-08, + "loss": 0.0006, + "num_tokens": 70182156.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7174328565597534, + "sampling/importance_sampling_ratio/mean": 1.0005311965942383, + "sampling/importance_sampling_ratio/min": 0.5483784079551697, + "sampling/sampling_logp_difference/max": 0.6007896661758423, + "sampling/sampling_logp_difference/mean": 0.012780500575900078, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 200.890625, + "completions/mean_terminated_length": 200.890625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.2707655429840088, + "epoch": 2.736519607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3971192336391878, + "kl": 0.08546942472457886, + "learning_rate": 2.4142622098356326e-08, + "loss": -0.0203, + "num_tokens": 70213301.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007374286651611, + "sampling/importance_sampling_ratio/min": 0.4582717716693878, + "sampling/sampling_logp_difference/max": 1.49442720413208, + "sampling/sampling_logp_difference/mean": 0.01658693701028824, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 199.40625, + "completions/mean_terminated_length": 199.40625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.24402666091918945, + "epoch": 2.7377450980392157, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9094111257876882, + "kl": 0.07704515755176544, + "learning_rate": 2.3924416286754345e-08, + "loss": -0.0374, + "num_tokens": 70241343.0, + "reward": 0.46875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002003908157349, + "sampling/importance_sampling_ratio/min": 0.4584660232067108, + "sampling/sampling_logp_difference/max": 0.7798690795898438, + "sampling/sampling_logp_difference/mean": 0.014194745570421219, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 179.640625, + "completions/mean_terminated_length": 179.640625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.1955706626176834, + "epoch": 2.7389705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049345331508012194, + "kl": 0.05805432051420212, + "learning_rate": 2.3707176878426882e-08, + "loss": 0.0006, + "num_tokens": 70270840.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9099674224853516, + "sampling/importance_sampling_ratio/mean": 1.0000171661376953, + "sampling/importance_sampling_ratio/min": 0.5362978577613831, + "sampling/sampling_logp_difference/max": 0.6470861434936523, + "sampling/sampling_logp_difference/mean": 0.012442233972251415, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 141.59375, + "completions/mean_terminated_length": 141.59375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.17659026384353638, + "epoch": 2.7401960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08369664875217793, + "kl": 0.07052315026521683, + "learning_rate": 2.3490904314356407e-08, + "loss": 0.0007, + "num_tokens": 70297630.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5473384857177734, + "sampling/importance_sampling_ratio/mean": 1.0005820989608765, + "sampling/importance_sampling_ratio/min": 0.6202817559242249, + "sampling/sampling_logp_difference/max": 0.4775815010070801, + "sampling/sampling_logp_difference/mean": 0.010419541969895363, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 222.96875, + "completions/mean_terminated_length": 222.96875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.18989717960357666, + "epoch": 2.741421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1308519337501126, + "kl": 0.04831966757774353, + "learning_rate": 2.327559903356241e-08, + "loss": 0.0332, + "num_tokens": 70337260.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999310314655304, + "sampling/importance_sampling_ratio/min": 0.5668860673904419, + "sampling/sampling_logp_difference/max": 0.7492504119873047, + "sampling/sampling_logp_difference/mean": 0.011877134442329407, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 162.984375, + "completions/mean_terminated_length": 162.984375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.19407083094120026, + "epoch": 2.7426470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06408573025545625, + "kl": 0.07185148447751999, + "learning_rate": 2.3061261473101002e-08, + "loss": 0.0007, + "num_tokens": 70369323.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002174377441406, + "sampling/importance_sampling_ratio/min": 0.6254509687423706, + "sampling/sampling_logp_difference/max": 0.9231328964233398, + "sampling/sampling_logp_difference/mean": 0.012973986566066742, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 189.78125, + "completions/mean_terminated_length": 189.78125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.3031126856803894, + "epoch": 2.743872549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9924284485712511, + "kl": 0.12816128134727478, + "learning_rate": 2.2847892068063755e-08, + "loss": -0.0145, + "num_tokens": 70403197.0, + "reward": 0.5, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000558614730835, + "sampling/importance_sampling_ratio/min": 0.48257020115852356, + "sampling/sampling_logp_difference/max": 0.8784655332565308, + "sampling/sampling_logp_difference/mean": 0.017242250964045525, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 209.640625, + "completions/mean_terminated_length": 209.640625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.24757881462574005, + "epoch": 2.7450980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041347249130428226, + "kl": 0.05959932878613472, + "learning_rate": 2.263549125157721e-08, + "loss": 0.0006, + "num_tokens": 70437350.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6208202838897705, + "sampling/importance_sampling_ratio/mean": 0.9997664093971252, + "sampling/importance_sampling_ratio/min": 0.19745701551437378, + "sampling/sampling_logp_difference/max": 1.6222343444824219, + "sampling/sampling_logp_difference/mean": 0.015598781406879425, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 179.703125, + "completions/mean_terminated_length": 179.703125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.22596284747123718, + "epoch": 2.7463235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5586376139596525, + "kl": 0.10207671672105789, + "learning_rate": 2.242405945480147e-08, + "loss": 0.004, + "num_tokens": 70465043.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5484459400177002, + "sampling/importance_sampling_ratio/mean": 1.000705599784851, + "sampling/importance_sampling_ratio/min": 0.5239237546920776, + "sampling/sampling_logp_difference/max": 0.6464091539382935, + "sampling/sampling_logp_difference/mean": 0.01542605459690094, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 211.46875, + "completions/mean_terminated_length": 211.46875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.23992714285850525, + "epoch": 2.747549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.659197625635065, + "kl": 0.05521634966135025, + "learning_rate": 2.2213597106929605e-08, + "loss": -0.0725, + "num_tokens": 70500913.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.768282175064087, + "sampling/importance_sampling_ratio/mean": 1.0001716613769531, + "sampling/importance_sampling_ratio/min": 0.47808602452278137, + "sampling/sampling_logp_difference/max": 0.7379646301269531, + "sampling/sampling_logp_difference/mean": 0.016141241416335106, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 182.53125, + "completions/mean_terminated_length": 182.53125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2644326388835907, + "epoch": 2.748774509803922, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0797480351914714, + "kl": 0.12182864546775818, + "learning_rate": 2.200410463518704e-08, + "loss": 0.0061, + "num_tokens": 70527683.0, + "reward": 0.25, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.656259298324585, + "sampling/importance_sampling_ratio/mean": 0.999809980392456, + "sampling/importance_sampling_ratio/min": 0.5676479339599609, + "sampling/sampling_logp_difference/max": 0.5662539005279541, + "sampling/sampling_logp_difference/mean": 0.015558083541691303, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 234.390625, + "completions/mean_terminated_length": 234.390625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.17076224088668823, + "epoch": 2.75, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04480063796229846, + "kl": 0.0427418127655983, + "learning_rate": 2.1795582464830153e-08, + "loss": 0.0004, + "num_tokens": 70558124.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999072551727295, + "sampling/importance_sampling_ratio/min": 0.4287963807582855, + "sampling/sampling_logp_difference/max": 0.8467731475830078, + "sampling/sampling_logp_difference/mean": 0.011818887665867805, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 158.4375, + "completions/mean_terminated_length": 158.4375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.1844603419303894, + "epoch": 2.751225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5747770439482864, + "kl": 0.12585288286209106, + "learning_rate": 2.1588031019145636e-08, + "loss": 0.0014, + "num_tokens": 70584200.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6396639347076416, + "sampling/importance_sampling_ratio/mean": 0.999675452709198, + "sampling/importance_sampling_ratio/min": 0.1735827922821045, + "sampling/sampling_logp_difference/max": 1.7511005401611328, + "sampling/sampling_logp_difference/mean": 0.011955867521464825, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 172.640625, + "completions/mean_terminated_length": 172.640625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.21045391261577606, + "epoch": 2.752450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049648451221509, + "kl": 0.06508484482765198, + "learning_rate": 2.13814507194498e-08, + "loss": 0.0006, + "num_tokens": 70612209.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9963213205337524, + "sampling/importance_sampling_ratio/mean": 1.0002024173736572, + "sampling/importance_sampling_ratio/min": 0.606671154499054, + "sampling/sampling_logp_difference/max": 0.6913061141967773, + "sampling/sampling_logp_difference/mean": 0.013279307633638382, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 213.3125, + "completions/mean_terminated_length": 213.3125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3622591495513916, + "epoch": 2.7536764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2309385357333875, + "kl": 0.097495436668396, + "learning_rate": 2.1175841985087707e-08, + "loss": -0.0041, + "num_tokens": 70646645.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5279804468154907, + "sampling/importance_sampling_ratio/mean": 0.9995732307434082, + "sampling/importance_sampling_ratio/min": 0.4854530394077301, + "sampling/sampling_logp_difference/max": 0.722672700881958, + "sampling/sampling_logp_difference/mean": 0.0192355178296566, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 205.828125, + "completions/mean_terminated_length": 205.828125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.20620039105415344, + "epoch": 2.7549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3278783409445232, + "kl": 0.0781605988740921, + "learning_rate": 2.097120523343199e-08, + "loss": -0.0055, + "num_tokens": 70677946.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.601424217224121, + "sampling/importance_sampling_ratio/mean": 0.9992896914482117, + "sampling/importance_sampling_ratio/min": 0.5440990328788757, + "sampling/sampling_logp_difference/max": 0.6086239814758301, + "sampling/sampling_logp_difference/mean": 0.013438969850540161, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 187.9375, + "completions/mean_terminated_length": 187.9375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.312122106552124, + "epoch": 2.756127450980392, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.517810663801415, + "kl": 0.1392800509929657, + "learning_rate": 2.076754087988214e-08, + "loss": 0.0082, + "num_tokens": 70704998.0, + "reward": 0.3125, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.9059869050979614, + "sampling/importance_sampling_ratio/mean": 1.0000531673431396, + "sampling/importance_sampling_ratio/min": 0.61529940366745, + "sampling/sampling_logp_difference/max": 0.6449999809265137, + "sampling/sampling_logp_difference/mean": 0.016311530023813248, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 214.828125, + "completions/mean_terminated_length": 214.828125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.18979710340499878, + "epoch": 2.7573529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1029685330856562, + "kl": 0.05746053159236908, + "learning_rate": 2.0564849337864122e-08, + "loss": -0.0047, + "num_tokens": 70736491.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998486042022705, + "sampling/importance_sampling_ratio/min": 0.4597283601760864, + "sampling/sampling_logp_difference/max": 0.777119517326355, + "sampling/sampling_logp_difference/mean": 0.012833474203944206, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 220.234375, + "completions/mean_terminated_length": 220.234375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.25403162837028503, + "epoch": 2.758578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6124423023997865, + "kl": 0.07223007082939148, + "learning_rate": 2.036313101882875e-08, + "loss": 0.0606, + "num_tokens": 70775386.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001699924468994, + "sampling/importance_sampling_ratio/min": 0.32495278120040894, + "sampling/sampling_logp_difference/max": 1.1240754127502441, + "sampling/sampling_logp_difference/mean": 0.014939755201339722, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 213.171875, + "completions/mean_terminated_length": 213.171875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.34486424922943115, + "epoch": 2.7598039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7254100844364808, + "kl": 0.125754714012146, + "learning_rate": 2.0162386332251648e-08, + "loss": -0.0044, + "num_tokens": 70809413.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6528112888336182, + "sampling/importance_sampling_ratio/mean": 0.999992311000824, + "sampling/importance_sampling_ratio/min": 0.4950985610485077, + "sampling/sampling_logp_difference/max": 0.7029983997344971, + "sampling/sampling_logp_difference/mean": 0.01870739459991455, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 250.265625, + "completions/mean_terminated_length": 250.265625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.279003769159317, + "epoch": 2.7610294117647056, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.8117417539800733, + "kl": 0.06691594421863556, + "learning_rate": 1.9962615685631568e-08, + "loss": -0.0738, + "num_tokens": 70842790.0, + "reward": 0.28125, + "reward_std": 0.6833621263504028, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.9848772287368774, + "sampling/importance_sampling_ratio/mean": 0.999861478805542, + "sampling/importance_sampling_ratio/min": 0.4800865352153778, + "sampling/sampling_logp_difference/max": 0.7337889671325684, + "sampling/sampling_logp_difference/mean": 0.014689471572637558, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 174.765625, + "completions/mean_terminated_length": 174.765625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.16617852449417114, + "epoch": 2.7622549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7688961229273223, + "kl": 0.06370288133621216, + "learning_rate": 1.976381948449035e-08, + "loss": 0.0283, + "num_tokens": 70878055.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000295639038086, + "sampling/importance_sampling_ratio/min": 0.3053562641143799, + "sampling/sampling_logp_difference/max": 1.1862760782241821, + "sampling/sampling_logp_difference/mean": 0.012256176210939884, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 212.578125, + "completions/mean_terminated_length": 212.578125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3126332759857178, + "epoch": 2.763480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1573843302974975, + "kl": 0.09631580114364624, + "learning_rate": 1.9565998132371808e-08, + "loss": -0.0185, + "num_tokens": 70914028.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.879313588142395, + "sampling/importance_sampling_ratio/mean": 0.999764084815979, + "sampling/importance_sampling_ratio/min": 0.5262352824211121, + "sampling/sampling_logp_difference/max": 0.6420068740844727, + "sampling/sampling_logp_difference/mean": 0.01587415672838688, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 232.0, + "completions/mean_terminated_length": 232.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.2396133840084076, + "epoch": 2.764705882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1332077897009292, + "kl": 0.08597449213266373, + "learning_rate": 1.936915203084055e-08, + "loss": -0.0046, + "num_tokens": 70948140.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7887362241744995, + "sampling/importance_sampling_ratio/mean": 1.0001415014266968, + "sampling/importance_sampling_ratio/min": 0.5966588854789734, + "sampling/sampling_logp_difference/max": 0.5815093517303467, + "sampling/sampling_logp_difference/mean": 0.013765338808298111, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 158.828125, + "completions/mean_terminated_length": 158.828125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.25703465938568115, + "epoch": 2.7659313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4201686527609199, + "kl": 0.10735650360584259, + "learning_rate": 1.9173281579481894e-08, + "loss": 0.002, + "num_tokens": 70975633.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8322973251342773, + "sampling/importance_sampling_ratio/mean": 1.0004141330718994, + "sampling/importance_sampling_ratio/min": 0.3579929769039154, + "sampling/sampling_logp_difference/max": 1.0272419452667236, + "sampling/sampling_logp_difference/mean": 0.01574607938528061, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 155.28125, + "completions/mean_terminated_length": 155.28125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.22355003654956818, + "epoch": 2.767156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3193496489083907, + "kl": 0.07810181379318237, + "learning_rate": 1.897838717590028e-08, + "loss": -0.0119, + "num_tokens": 71008387.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7434394359588623, + "sampling/importance_sampling_ratio/mean": 0.9994719624519348, + "sampling/importance_sampling_ratio/min": 0.48811832070350647, + "sampling/sampling_logp_difference/max": 0.7171974182128906, + "sampling/sampling_logp_difference/mean": 0.012913529761135578, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 197.421875, + "completions/mean_terminated_length": 197.421875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.18570518493652344, + "epoch": 2.7683823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04140384901991634, + "kl": 0.04976304620504379, + "learning_rate": 1.8784469215719077e-08, + "loss": 0.0005, + "num_tokens": 71040126.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4297953844070435, + "sampling/importance_sampling_ratio/mean": 0.9996899962425232, + "sampling/importance_sampling_ratio/min": 0.4256052076816559, + "sampling/sampling_logp_difference/max": 0.8542431592941284, + "sampling/sampling_logp_difference/mean": 0.01033171359449625, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 199.109375, + "completions/mean_terminated_length": 199.109375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.24861127138137817, + "epoch": 2.769607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1049193177717076, + "kl": 0.09222990274429321, + "learning_rate": 1.8591528092579524e-08, + "loss": -0.0097, + "num_tokens": 71070277.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9989231824874878, + "sampling/importance_sampling_ratio/mean": 1.0008349418640137, + "sampling/importance_sampling_ratio/min": 0.6117143630981445, + "sampling/sampling_logp_difference/max": 0.6926085948944092, + "sampling/sampling_logp_difference/mean": 0.014516171999275684, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.29695504903793335, + "epoch": 2.7708333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0262807260800688, + "kl": 0.08197468519210815, + "learning_rate": 1.8399564198139707e-08, + "loss": 0.0013, + "num_tokens": 71107221.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6657474040985107, + "sampling/importance_sampling_ratio/mean": 0.9994725584983826, + "sampling/importance_sampling_ratio/min": 0.36581653356552124, + "sampling/sampling_logp_difference/max": 1.0056233406066895, + "sampling/sampling_logp_difference/mean": 0.015666469931602478, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 201.34375, + "completions/mean_terminated_length": 201.34375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.18237395584583282, + "epoch": 2.7720588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04637110740425972, + "kl": 0.048668306320905685, + "learning_rate": 1.8208577922074308e-08, + "loss": 0.0004, + "num_tokens": 71136587.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.672214388847351, + "sampling/importance_sampling_ratio/mean": 1.0000160932540894, + "sampling/importance_sampling_ratio/min": 0.593717098236084, + "sampling/sampling_logp_difference/max": 0.5213522911071777, + "sampling/sampling_logp_difference/mean": 0.011836504563689232, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 172.171875, + "completions/mean_terminated_length": 172.171875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2061919867992401, + "epoch": 2.7732843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0663790791649386, + "kl": 0.07428238540887833, + "learning_rate": 1.8018569652073378e-08, + "loss": 0.0007, + "num_tokens": 71169798.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006260871887207, + "sampling/importance_sampling_ratio/min": 0.5007621049880981, + "sampling/sampling_logp_difference/max": 0.7730910778045654, + "sampling/sampling_logp_difference/mean": 0.014331132173538208, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.18149055540561676, + "epoch": 2.7745098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9289501476314372, + "kl": 0.07153521478176117, + "learning_rate": 1.7829539773841608e-08, + "loss": 0.054, + "num_tokens": 71196558.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000271201133728, + "sampling/importance_sampling_ratio/min": 0.4579724669456482, + "sampling/sampling_logp_difference/max": 1.1921019554138184, + "sampling/sampling_logp_difference/mean": 0.011731035076081753, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 219.6875, + "completions/mean_terminated_length": 219.6875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.285845011472702, + "epoch": 2.775735294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05058568161150038, + "kl": 0.06788494437932968, + "learning_rate": 1.7641488671097606e-08, + "loss": 0.0007, + "num_tokens": 71229466.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9552366733551025, + "sampling/importance_sampling_ratio/mean": 1.0000596046447754, + "sampling/importance_sampling_ratio/min": 0.5756021738052368, + "sampling/sampling_logp_difference/max": 0.6705112457275391, + "sampling/sampling_logp_difference/mean": 0.014759533107280731, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 175.4375, + "completions/mean_terminated_length": 175.4375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.21992114186286926, + "epoch": 2.7769607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3331153499690847, + "kl": 0.07214683294296265, + "learning_rate": 1.745441672557335e-08, + "loss": 0.0021, + "num_tokens": 71257478.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6355787515640259, + "sampling/importance_sampling_ratio/mean": 1.000448226928711, + "sampling/importance_sampling_ratio/min": 0.396942138671875, + "sampling/sampling_logp_difference/max": 0.92396479845047, + "sampling/sampling_logp_difference/mean": 0.013438953086733818, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 197.140625, + "completions/mean_terminated_length": 197.140625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.22281494736671448, + "epoch": 2.778186274509804, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.038184728011804, + "kl": 0.09663940966129303, + "learning_rate": 1.7268324317012973e-08, + "loss": 0.0078, + "num_tokens": 71293583.0, + "reward": -0.0625, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.608886480331421, + "sampling/importance_sampling_ratio/mean": 0.9999948740005493, + "sampling/importance_sampling_ratio/min": 0.5804426074028015, + "sampling/sampling_logp_difference/max": 0.5439643859863281, + "sampling/sampling_logp_difference/mean": 0.01380915567278862, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 205.640625, + "completions/mean_terminated_length": 205.640625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.251699835062027, + "epoch": 2.7794117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.816421715670366, + "kl": 0.09073509275913239, + "learning_rate": 1.7083211823172184e-08, + "loss": -0.1407, + "num_tokens": 71336360.0, + "reward": -0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.664367914199829, + "sampling/importance_sampling_ratio/mean": 0.9992032051086426, + "sampling/importance_sampling_ratio/min": 0.48935699462890625, + "sampling/sampling_logp_difference/max": 0.714663028717041, + "sampling/sampling_logp_difference/mean": 0.013966759666800499, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 159.28125, + "completions/mean_terminated_length": 159.28125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.23307263851165771, + "epoch": 2.780637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.524201781355275, + "kl": 0.09997309744358063, + "learning_rate": 1.6899079619817792e-08, + "loss": -0.0029, + "num_tokens": 71367162.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.8080004453659058, + "sampling/importance_sampling_ratio/mean": 1.0001411437988281, + "sampling/importance_sampling_ratio/min": 0.6400461792945862, + "sampling/sampling_logp_difference/max": 0.5922214984893799, + "sampling/sampling_logp_difference/mean": 0.0136976707726717, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 198.046875, + "completions/mean_terminated_length": 198.046875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.23901107907295227, + "epoch": 2.781862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2901761842056259, + "kl": 0.05361397564411163, + "learning_rate": 1.6715928080726415e-08, + "loss": 0.0052, + "num_tokens": 71394269.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7307579517364502, + "sampling/importance_sampling_ratio/mean": 1.000693917274475, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.5485594272613525, + "sampling/sampling_logp_difference/mean": 0.01256822794675827, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 193.03125, + "completions/mean_terminated_length": 193.03125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2347640097141266, + "epoch": 2.7830882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2367901541403803, + "kl": 0.07756893336772919, + "learning_rate": 1.653375757768405e-08, + "loss": 0.0092, + "num_tokens": 71429039.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992195963859558, + "sampling/importance_sampling_ratio/min": 0.3659481406211853, + "sampling/sampling_logp_difference/max": 1.0052636861801147, + "sampling/sampling_logp_difference/mean": 0.016058577224612236, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 206.359375, + "completions/mean_terminated_length": 206.359375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.23786799609661102, + "epoch": 2.784313725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0651149788157532, + "kl": 0.11119990795850754, + "learning_rate": 1.6352568480485275e-08, + "loss": 0.0857, + "num_tokens": 71461702.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000898838043213, + "sampling/importance_sampling_ratio/min": 0.5441979169845581, + "sampling/sampling_logp_difference/max": 0.7225730419158936, + "sampling/sampling_logp_difference/mean": 0.014432122930884361, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 234.765625, + "completions/mean_terminated_length": 234.765625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.26328566670417786, + "epoch": 2.7855392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.76476429463243, + "kl": 0.05306101590394974, + "learning_rate": 1.6172361156932547e-08, + "loss": 0.0296, + "num_tokens": 71495927.0, + "reward": 0.3125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.7942874431610107, + "sampling/importance_sampling_ratio/mean": 0.9999489784240723, + "sampling/importance_sampling_ratio/min": 0.6099900603294373, + "sampling/sampling_logp_difference/max": 0.5846079587936401, + "sampling/sampling_logp_difference/mean": 0.014174356125295162, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 160.859375, + "completions/mean_terminated_length": 160.859375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2133684754371643, + "epoch": 2.786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04760407754695877, + "kl": 0.08011157065629959, + "learning_rate": 1.5993135972835303e-08, + "loss": 0.0007, + "num_tokens": 71519790.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9553382396697998, + "sampling/importance_sampling_ratio/mean": 1.0002846717834473, + "sampling/importance_sampling_ratio/min": 0.48663073778152466, + "sampling/sampling_logp_difference/max": 0.7202496528625488, + "sampling/sampling_logp_difference/mean": 0.014282667078077793, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 226.59375, + "completions/mean_terminated_length": 226.59375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2044447660446167, + "epoch": 2.7879901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4896579938687486, + "kl": 0.060443829745054245, + "learning_rate": 1.581489329200919e-08, + "loss": -0.0168, + "num_tokens": 71551812.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997121691703796, + "sampling/importance_sampling_ratio/min": 0.378704696893692, + "sampling/sampling_logp_difference/max": 1.0303916931152344, + "sampling/sampling_logp_difference/mean": 0.01223063375800848, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 180.6875, + "completions/mean_terminated_length": 180.6875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.28990983963012695, + "epoch": 2.7892156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.312920820950326, + "kl": 0.14007467031478882, + "learning_rate": 1.5637633476275724e-08, + "loss": 0.0153, + "num_tokens": 71579408.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6084250211715698, + "sampling/importance_sampling_ratio/mean": 1.000478744506836, + "sampling/importance_sampling_ratio/min": 0.3463849425315857, + "sampling/sampling_logp_difference/max": 1.0602045059204102, + "sampling/sampling_logp_difference/mean": 0.01548030786216259, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 211.0, + "completions/mean_terminated_length": 211.0, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.18988262116909027, + "epoch": 2.7904411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0685159768950878, + "kl": 0.05716770514845848, + "learning_rate": 1.5461356885461075e-08, + "loss": 0.0186, + "num_tokens": 71607232.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6598341464996338, + "sampling/importance_sampling_ratio/mean": 1.0002659559249878, + "sampling/importance_sampling_ratio/min": 0.6066067218780518, + "sampling/sampling_logp_difference/max": 0.5067176818847656, + "sampling/sampling_logp_difference/mean": 0.010263778269290924, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 257.953125, + "completions/mean_terminated_length": 257.953125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.28556105494499207, + "epoch": 2.7916666666666665, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.510623953144217, + "kl": 0.1083979457616806, + "learning_rate": 1.528606387739545e-08, + "loss": 0.0404, + "num_tokens": 71649165.0, + "reward": 0.8125, + "reward_std": 0.40311288833618164, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999854564666748, + "sampling/importance_sampling_ratio/min": 0.49440720677375793, + "sampling/sampling_logp_difference/max": 0.7608175277709961, + "sampling/sampling_logp_difference/mean": 0.015845760703086853, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 211.0625, + "completions/mean_terminated_length": 211.0625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.22970786690711975, + "epoch": 2.792892156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6020697058076987, + "kl": 0.10661774128675461, + "learning_rate": 1.5111754807912546e-08, + "loss": 0.0193, + "num_tokens": 71677921.0, + "reward": 0.40625, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5262553691864014, + "sampling/importance_sampling_ratio/mean": 1.0001559257507324, + "sampling/importance_sampling_ratio/min": 0.43244117498397827, + "sampling/sampling_logp_difference/max": 0.8383089900016785, + "sampling/sampling_logp_difference/mean": 0.012312108650803566, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 178.96875, + "completions/mean_terminated_length": 178.96875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.15283872187137604, + "epoch": 2.7941176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03991502990475642, + "kl": 0.04768148809671402, + "learning_rate": 1.493843003084888e-08, + "loss": 0.0005, + "num_tokens": 71711263.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5243661403656006, + "sampling/importance_sampling_ratio/mean": 1.001168966293335, + "sampling/importance_sampling_ratio/min": 0.6410229206085205, + "sampling/sampling_logp_difference/max": 0.44469010829925537, + "sampling/sampling_logp_difference/mean": 0.010422519408166409, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 150.578125, + "completions/mean_terminated_length": 150.578125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.22832804918289185, + "epoch": 2.795343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3769658370315165, + "kl": 0.07298359274864197, + "learning_rate": 1.4766089898042677e-08, + "loss": -0.0075, + "num_tokens": 71737188.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5746679306030273, + "sampling/importance_sampling_ratio/mean": 1.0000483989715576, + "sampling/importance_sampling_ratio/min": 0.5372995734214783, + "sampling/sampling_logp_difference/max": 0.6211994886398315, + "sampling/sampling_logp_difference/mean": 0.013986149802803993, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 169.59375, + "completions/mean_terminated_length": 169.59375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.18246373534202576, + "epoch": 2.7965686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07112009120357819, + "kl": 0.05187829211354256, + "learning_rate": 1.4594734759333482e-08, + "loss": 0.0005, + "num_tokens": 71766746.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004734992980957, + "sampling/importance_sampling_ratio/min": 0.6158343553543091, + "sampling/sampling_logp_difference/max": 1.4400757551193237, + "sampling/sampling_logp_difference/mean": 0.011682495474815369, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 151.296875, + "completions/mean_terminated_length": 151.296875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.2154589593410492, + "epoch": 2.797794117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07547142912461965, + "kl": 0.08189049363136292, + "learning_rate": 1.4424364962561386e-08, + "loss": 0.0008, + "num_tokens": 71794749.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.762123703956604, + "sampling/importance_sampling_ratio/mean": 0.9996479749679565, + "sampling/importance_sampling_ratio/min": 0.5038022994995117, + "sampling/sampling_logp_difference/max": 0.6855714321136475, + "sampling/sampling_logp_difference/mean": 0.012780768796801567, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 152.1875, + "completions/mean_terminated_length": 152.1875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.26381582021713257, + "epoch": 2.799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9889896342701454, + "kl": 0.13400951027870178, + "learning_rate": 1.4254980853566246e-08, + "loss": 0.0213, + "num_tokens": 71822761.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5837942361831665, + "sampling/importance_sampling_ratio/mean": 1.000157117843628, + "sampling/importance_sampling_ratio/min": 0.5473312735557556, + "sampling/sampling_logp_difference/max": 0.6027010679244995, + "sampling/sampling_logp_difference/mean": 0.015911083668470383, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 166.046875, + "completions/mean_terminated_length": 166.046875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2303771823644638, + "epoch": 2.8002450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3140962471762698, + "kl": 0.13876497745513916, + "learning_rate": 1.4086582776187239e-08, + "loss": -0.0087, + "num_tokens": 71853676.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5536686182022095, + "sampling/importance_sampling_ratio/mean": 0.9994738101959229, + "sampling/importance_sampling_ratio/min": 0.49137696623802185, + "sampling/sampling_logp_difference/max": 0.7105436325073242, + "sampling/sampling_logp_difference/mean": 0.014124227687716484, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 207.421875, + "completions/mean_terminated_length": 207.421875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2937871813774109, + "epoch": 2.8014705882352944, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.017779204162288, + "kl": 0.08879870176315308, + "learning_rate": 1.3919171072261537e-08, + "loss": 0.0605, + "num_tokens": 71887911.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6441470384597778, + "sampling/importance_sampling_ratio/mean": 0.9999390244483948, + "sampling/importance_sampling_ratio/min": 0.5000088214874268, + "sampling/sampling_logp_difference/max": 0.6931295394897461, + "sampling/sampling_logp_difference/mean": 0.016515297815203667, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 209.140625, + "completions/mean_terminated_length": 209.140625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.21904979646205902, + "epoch": 2.8026960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.194619906314769, + "kl": 0.05970887839794159, + "learning_rate": 1.3752746081624467e-08, + "loss": 0.0022, + "num_tokens": 71921472.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6574615240097046, + "sampling/importance_sampling_ratio/mean": 0.9997144937515259, + "sampling/importance_sampling_ratio/min": 0.6155778169631958, + "sampling/sampling_logp_difference/max": 0.5052871704101562, + "sampling/sampling_logp_difference/mean": 0.013201612047851086, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 234.796875, + "completions/mean_terminated_length": 234.796875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.1851789802312851, + "epoch": 2.803921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0501740668660753, + "kl": 0.06818913668394089, + "learning_rate": 1.3587308142108178e-08, + "loss": 0.0005, + "num_tokens": 71955075.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5082266330718994, + "sampling/importance_sampling_ratio/mean": 0.9994621872901917, + "sampling/importance_sampling_ratio/min": 0.2729172706604004, + "sampling/sampling_logp_difference/max": 1.2985866069793701, + "sampling/sampling_logp_difference/mean": 0.012057261541485786, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 153.21875, + "completions/mean_terminated_length": 153.21875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.20215308666229248, + "epoch": 2.8051470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.04414976885733, + "kl": 0.07899628579616547, + "learning_rate": 1.3422857589541148e-08, + "loss": -0.0029, + "num_tokens": 71979233.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4445340633392334, + "sampling/importance_sampling_ratio/mean": 0.9998143315315247, + "sampling/importance_sampling_ratio/min": 0.49491244554519653, + "sampling/sampling_logp_difference/max": 0.7033743858337402, + "sampling/sampling_logp_difference/mean": 0.012561185285449028, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 185.703125, + "completions/mean_terminated_length": 185.703125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.28129348158836365, + "epoch": 2.806372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0391032427759417, + "kl": 0.16123129427433014, + "learning_rate": 1.3259394757747677e-08, + "loss": -0.0029, + "num_tokens": 72005262.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.8564090728759766, + "sampling/importance_sampling_ratio/mean": 1.0004281997680664, + "sampling/importance_sampling_ratio/min": 0.5909593105316162, + "sampling/sampling_logp_difference/max": 0.6186439990997314, + "sampling/sampling_logp_difference/mean": 0.015385773964226246, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 236.90625, + "completions/mean_terminated_length": 236.90625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.21015983819961548, + "epoch": 2.8075980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5423523965562058, + "kl": 0.05395539849996567, + "learning_rate": 1.3096919978546838e-08, + "loss": 0.0334, + "num_tokens": 72035608.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.8786742687225342, + "sampling/importance_sampling_ratio/mean": 1.000143051147461, + "sampling/importance_sampling_ratio/min": 0.5484020113945007, + "sampling/sampling_logp_difference/max": 0.6305663585662842, + "sampling/sampling_logp_difference/mean": 0.012865163385868073, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 227.421875, + "completions/mean_terminated_length": 227.421875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.1998668611049652, + "epoch": 2.8088235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7835263932126011, + "kl": 0.06374235451221466, + "learning_rate": 1.2935433581752365e-08, + "loss": -0.0049, + "num_tokens": 72067299.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.550827145576477, + "sampling/importance_sampling_ratio/mean": 0.9996589422225952, + "sampling/importance_sampling_ratio/min": 0.6136133670806885, + "sampling/sampling_logp_difference/max": 0.4883902072906494, + "sampling/sampling_logp_difference/mean": 0.012076803483068943, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 230.65625, + "completions/mean_terminated_length": 230.65625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.19293788075447083, + "epoch": 2.810049019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1650726238622964, + "kl": 0.05070199817419052, + "learning_rate": 1.2774935895171091e-08, + "loss": 0.0186, + "num_tokens": 72097997.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.826583981513977, + "sampling/importance_sampling_ratio/mean": 1.0000672340393066, + "sampling/importance_sampling_ratio/min": 0.5401414036750793, + "sampling/sampling_logp_difference/max": 0.6159243583679199, + "sampling/sampling_logp_difference/mean": 0.011585809290409088, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 204.53125, + "completions/mean_terminated_length": 204.53125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.1433851271867752, + "epoch": 2.811274509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2036905104523585, + "kl": 0.060173049569129944, + "learning_rate": 1.2615427244603405e-08, + "loss": 0.0364, + "num_tokens": 72125951.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5772333145141602, + "sampling/importance_sampling_ratio/mean": 1.0000299215316772, + "sampling/importance_sampling_ratio/min": 0.4926629066467285, + "sampling/sampling_logp_difference/max": 0.7079300880432129, + "sampling/sampling_logp_difference/mean": 0.010021264664828777, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 201.234375, + "completions/mean_terminated_length": 201.234375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.20590944588184357, + "epoch": 2.8125, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8979533208446774, + "kl": 0.08708874881267548, + "learning_rate": 1.2456907953841633e-08, + "loss": -0.0018, + "num_tokens": 72154942.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9617762565612793, + "sampling/importance_sampling_ratio/mean": 1.0001060962677002, + "sampling/importance_sampling_ratio/min": 0.5467776656150818, + "sampling/sampling_logp_difference/max": 0.6738502979278564, + "sampling/sampling_logp_difference/mean": 0.011773111298680305, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 208.59375, + "completions/mean_terminated_length": 208.59375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.27260729670524597, + "epoch": 2.813725490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5072733075937548, + "kl": 0.07039877027273178, + "learning_rate": 1.2299378344669986e-08, + "loss": 0.0132, + "num_tokens": 72183188.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.9314006567001343, + "sampling/importance_sampling_ratio/mean": 0.9993040561676025, + "sampling/importance_sampling_ratio/min": 0.4164551794528961, + "sampling/sampling_logp_difference/max": 0.8759764432907104, + "sampling/sampling_logp_difference/mean": 0.015850547701120377, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 255.90625, + "completions/mean_terminated_length": 255.90625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.257384717464447, + "epoch": 2.814950980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.496029110977191, + "kl": 0.06804078817367554, + "learning_rate": 1.2142838736863559e-08, + "loss": -0.0224, + "num_tokens": 72214878.0, + "reward": 0.125, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6147757768630981, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 0.4597201943397522, + "sampling/sampling_logp_difference/max": 0.777137279510498, + "sampling/sampling_logp_difference/mean": 0.014001351781189442, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 147.59375, + "completions/mean_terminated_length": 147.59375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.2722303569316864, + "epoch": 2.8161764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2582469630512934, + "kl": 0.13115857541561127, + "learning_rate": 1.1987289448187777e-08, + "loss": 0.0013, + "num_tokens": 72242196.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000063419342041, + "sampling/importance_sampling_ratio/min": 0.3725292980670929, + "sampling/sampling_logp_difference/max": 0.9874396324157715, + "sampling/sampling_logp_difference/mean": 0.01594376750290394, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 187.375, + "completions/mean_terminated_length": 187.375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.2505417466163635, + "epoch": 2.8174019607843137, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1754835273744177, + "kl": 0.06322623044252396, + "learning_rate": 1.183273079439795e-08, + "loss": -0.042, + "num_tokens": 72276604.0, + "reward": 0.03125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6416311264038086, + "sampling/importance_sampling_ratio/mean": 0.9998761415481567, + "sampling/importance_sampling_ratio/min": 0.5535147190093994, + "sampling/sampling_logp_difference/max": 0.5914669036865234, + "sampling/sampling_logp_difference/mean": 0.01484741736203432, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 184.609375, + "completions/mean_terminated_length": 184.609375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2452416718006134, + "epoch": 2.818627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045131944661177266, + "kl": 0.07852520793676376, + "learning_rate": 1.167916308923822e-08, + "loss": 0.0008, + "num_tokens": 72310803.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8235124349594116, + "sampling/importance_sampling_ratio/mean": 1.0006556510925293, + "sampling/importance_sampling_ratio/min": 0.5999748110771179, + "sampling/sampling_logp_difference/max": 0.6007645130157471, + "sampling/sampling_logp_difference/mean": 0.013840295374393463, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 198.3125, + "completions/mean_terminated_length": 198.3125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.17873989045619965, + "epoch": 2.8198529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3366842538078287, + "kl": 0.077562615275383, + "learning_rate": 1.152658664444145e-08, + "loss": 0.044, + "num_tokens": 72342071.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9442808628082275, + "sampling/importance_sampling_ratio/mean": 1.000274896621704, + "sampling/importance_sampling_ratio/min": 0.3858293294906616, + "sampling/sampling_logp_difference/max": 0.9523601531982422, + "sampling/sampling_logp_difference/mean": 0.012566540390253067, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 192.65625, + "completions/mean_terminated_length": 192.65625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.24401333928108215, + "epoch": 2.821078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03960337741431701, + "kl": 0.06033939868211746, + "learning_rate": 1.1375001769727999e-08, + "loss": 0.0006, + "num_tokens": 72373649.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.649314045906067, + "sampling/importance_sampling_ratio/mean": 0.9994627237319946, + "sampling/importance_sampling_ratio/min": 0.6040035486221313, + "sampling/sampling_logp_difference/max": 0.5041751861572266, + "sampling/sampling_logp_difference/mean": 0.0143990283831954, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 230.015625, + "completions/mean_terminated_length": 230.015625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.24470923840999603, + "epoch": 2.8223039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4262812566610688, + "kl": 0.07444335520267487, + "learning_rate": 1.1224408772805671e-08, + "loss": 0.0377, + "num_tokens": 72407570.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.9373067617416382, + "sampling/importance_sampling_ratio/mean": 1.0001022815704346, + "sampling/importance_sampling_ratio/min": 0.5676461458206177, + "sampling/sampling_logp_difference/max": 0.6612987518310547, + "sampling/sampling_logp_difference/mean": 0.014205005019903183, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 230.21875, + "completions/mean_terminated_length": 230.21875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.2483881264925003, + "epoch": 2.8235294117647056, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.570681491857653, + "kl": 0.08415378630161285, + "learning_rate": 1.1074807959368715e-08, + "loss": -0.005, + "num_tokens": 72438672.0, + "reward": 0.6875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6552093029022217, + "sampling/importance_sampling_ratio/mean": 1.0005154609680176, + "sampling/importance_sampling_ratio/min": 0.5130184888839722, + "sampling/sampling_logp_difference/max": 0.6674433946609497, + "sampling/sampling_logp_difference/mean": 0.014159232378005981, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 211.140625, + "completions/mean_terminated_length": 211.140625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2508814036846161, + "epoch": 2.8247549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7507579893115275, + "kl": 0.06842119991779327, + "learning_rate": 1.0926199633097154e-08, + "loss": -0.0181, + "num_tokens": 72469945.0, + "reward": 0.0, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000756978988647, + "sampling/importance_sampling_ratio/min": 0.44627147912979126, + "sampling/sampling_logp_difference/max": 0.9812498092651367, + "sampling/sampling_logp_difference/mean": 0.014487473294138908, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.25903528928756714, + "epoch": 2.825980392156863, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.3158324343348116, + "kl": 0.10344567149877548, + "learning_rate": 1.0778584095656685e-08, + "loss": -0.038, + "num_tokens": 72496241.0, + "reward": 0.5, + "reward_std": 0.5879635810852051, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5469341278076172, + "sampling/importance_sampling_ratio/mean": 1.0004578828811646, + "sampling/importance_sampling_ratio/min": 0.46739014983177185, + "sampling/sampling_logp_difference/max": 0.7605909109115601, + "sampling/sampling_logp_difference/mean": 0.016178598627448082, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 195.578125, + "completions/mean_terminated_length": 195.578125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.3704570531845093, + "epoch": 2.827205882352941, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9893595155209258, + "kl": 0.14270761609077454, + "learning_rate": 1.0631961646697384e-08, + "loss": -0.001, + "num_tokens": 72532166.0, + "reward": 0.46875, + "reward_std": 0.7348873615264893, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 0.5401919484138489, + "sampling/sampling_logp_difference/max": 0.9278788566589355, + "sampling/sampling_logp_difference/mean": 0.020588595420122147, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 214.28125, + "completions/mean_terminated_length": 214.28125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.23848213255405426, + "epoch": 2.8284313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09189250060888837, + "kl": 0.08650361001491547, + "learning_rate": 1.0486332583853564e-08, + "loss": 0.0008, + "num_tokens": 72567624.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001864433288574, + "sampling/importance_sampling_ratio/min": 0.393848717212677, + "sampling/sampling_logp_difference/max": 0.931788444519043, + "sampling/sampling_logp_difference/mean": 0.013897368684411049, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 158.75, + "completions/mean_terminated_length": 158.75, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.26923668384552, + "epoch": 2.829656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.517856496680662, + "kl": 0.09672123193740845, + "learning_rate": 1.0341697202742971e-08, + "loss": -0.0092, + "num_tokens": 72594488.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6091969013214111, + "sampling/importance_sampling_ratio/mean": 1.0006260871887207, + "sampling/importance_sampling_ratio/min": 0.6420642733573914, + "sampling/sampling_logp_difference/max": 0.4757351875305176, + "sampling/sampling_logp_difference/mean": 0.014627790078520775, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 210.265625, + "completions/mean_terminated_length": 210.265625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.26740381121635437, + "epoch": 2.8308823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.216964226093934, + "kl": 0.20067372918128967, + "learning_rate": 1.0198055796966253e-08, + "loss": 0.0058, + "num_tokens": 72633081.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997767210006714, + "sampling/importance_sampling_ratio/min": 0.48305249214172363, + "sampling/sampling_logp_difference/max": 0.7920181751251221, + "sampling/sampling_logp_difference/mean": 0.016341229900717735, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 157.4375, + "completions/mean_terminated_length": 157.4375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.24633532762527466, + "epoch": 2.832107843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5443078259153119, + "kl": 0.13186615705490112, + "learning_rate": 1.0055408658106446e-08, + "loss": 0.0013, + "num_tokens": 72660901.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6306818723678589, + "sampling/importance_sampling_ratio/mean": 0.9997966885566711, + "sampling/importance_sampling_ratio/min": 0.38238391280174255, + "sampling/sampling_logp_difference/max": 0.9613301753997803, + "sampling/sampling_logp_difference/mean": 0.016728542745113373, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 235.171875, + "completions/mean_terminated_length": 235.171875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.24795357882976532, + "epoch": 2.8333333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.466908573722674, + "kl": 0.056717436760663986, + "learning_rate": 9.913756075728086e-09, + "loss": 0.01, + "num_tokens": 72693408.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7994662523269653, + "sampling/importance_sampling_ratio/mean": 1.0001740455627441, + "sampling/importance_sampling_ratio/min": 0.6203963756561279, + "sampling/sampling_logp_difference/max": 0.5874900817871094, + "sampling/sampling_logp_difference/mean": 0.01397133432328701, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 182.296875, + "completions/mean_terminated_length": 182.296875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.313896119594574, + "epoch": 2.8345588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7254654486321284, + "kl": 0.10548032075166702, + "learning_rate": 9.77309833737705e-09, + "loss": -0.0251, + "num_tokens": 72724147.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.71084725856781, + "sampling/importance_sampling_ratio/mean": 1.0006495714187622, + "sampling/importance_sampling_ratio/min": 0.4789983928203583, + "sampling/sampling_logp_difference/max": 0.7360580563545227, + "sampling/sampling_logp_difference/mean": 0.01815841719508171, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 200.140625, + "completions/mean_terminated_length": 200.140625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.21204930543899536, + "epoch": 2.8357843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9665245001403913, + "kl": 0.0741671472787857, + "learning_rate": 9.633435728579553e-09, + "loss": 0.0548, + "num_tokens": 72762476.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6213301420211792, + "sampling/importance_sampling_ratio/mean": 0.9997785091400146, + "sampling/importance_sampling_ratio/min": 0.47327375411987305, + "sampling/sampling_logp_difference/max": 0.7480813264846802, + "sampling/sampling_logp_difference/mean": 0.013232581317424774, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 159.359375, + "completions/mean_terminated_length": 159.359375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.1839318573474884, + "epoch": 2.8370098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07242427594017536, + "kl": 0.07046204805374146, + "learning_rate": 9.494768532841868e-09, + "loss": 0.0007, + "num_tokens": 72788339.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996984004974365, + "sampling/importance_sampling_ratio/min": 0.452145516872406, + "sampling/sampling_logp_difference/max": 0.7937512397766113, + "sampling/sampling_logp_difference/mean": 0.012966913171112537, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 238.25, + "completions/mean_terminated_length": 238.25, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.29241546988487244, + "epoch": 2.838235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2005985580410745, + "kl": 0.10323096811771393, + "learning_rate": 9.357097031649664e-09, + "loss": 0.0064, + "num_tokens": 72827155.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002491474151611, + "sampling/importance_sampling_ratio/min": 0.4755294620990753, + "sampling/sampling_logp_difference/max": 0.7449169158935547, + "sampling/sampling_logp_difference/mean": 0.01652509719133377, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 207.546875, + "completions/mean_terminated_length": 207.546875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2902880012989044, + "epoch": 2.8394607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05213254317795866, + "kl": 0.09981686621904373, + "learning_rate": 9.22042150446728e-09, + "loss": 0.0009, + "num_tokens": 72860934.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9124470949172974, + "sampling/importance_sampling_ratio/mean": 0.9997761249542236, + "sampling/importance_sampling_ratio/min": 0.5358520746231079, + "sampling/sampling_logp_difference/max": 0.648383617401123, + "sampling/sampling_logp_difference/mean": 0.015693334862589836, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 154.4375, + "completions/mean_terminated_length": 154.4375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.15571218729019165, + "epoch": 2.840686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06519853609540806, + "kl": 0.05649857223033905, + "learning_rate": 9.084742228737564e-09, + "loss": 0.0006, + "num_tokens": 72886786.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9490232467651367, + "sampling/importance_sampling_ratio/mean": 0.9995454549789429, + "sampling/importance_sampling_ratio/min": 0.1234641969203949, + "sampling/sampling_logp_difference/max": 2.091804027557373, + "sampling/sampling_logp_difference/mean": 0.011238237842917442, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2267010509967804, + "epoch": 2.8419117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4949329142363874, + "kl": 0.05994710326194763, + "learning_rate": 8.95005947988059e-09, + "loss": -0.0171, + "num_tokens": 72921898.0, + "reward": 0.28125, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.483286738395691, + "sampling/importance_sampling_ratio/mean": 0.9996479749679565, + "sampling/importance_sampling_ratio/min": 0.6147379279136658, + "sampling/sampling_logp_difference/max": 0.48655927181243896, + "sampling/sampling_logp_difference/mean": 0.012886783108115196, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.22738264501094818, + "epoch": 2.843137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08576645963729009, + "kl": 0.084224171936512, + "learning_rate": 8.816373531293941e-09, + "loss": 0.0008, + "num_tokens": 72961258.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8171416521072388, + "sampling/importance_sampling_ratio/mean": 1.0003807544708252, + "sampling/importance_sampling_ratio/min": 0.4550248384475708, + "sampling/sampling_logp_difference/max": 0.7874033451080322, + "sampling/sampling_logp_difference/mean": 0.014613781124353409, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 193.875, + "completions/mean_terminated_length": 193.875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2447119653224945, + "epoch": 2.844362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7406505358690478, + "kl": 0.12149256467819214, + "learning_rate": 8.683684654351597e-09, + "loss": 0.0162, + "num_tokens": 72991970.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4657742977142334, + "sampling/importance_sampling_ratio/mean": 1.0001851320266724, + "sampling/importance_sampling_ratio/min": 0.5134783983230591, + "sampling/sampling_logp_difference/max": 0.6665472984313965, + "sampling/sampling_logp_difference/mean": 0.01529346127063036, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 170.390625, + "completions/mean_terminated_length": 170.390625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.20992591977119446, + "epoch": 2.8455882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10045521240962015, + "kl": 0.09384244680404663, + "learning_rate": 8.551993118403656e-09, + "loss": 0.0009, + "num_tokens": 73026075.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7112091779708862, + "sampling/importance_sampling_ratio/mean": 0.9992998838424683, + "sampling/importance_sampling_ratio/min": 0.6041069626808167, + "sampling/sampling_logp_difference/max": 0.5372002124786377, + "sampling/sampling_logp_difference/mean": 0.012351354584097862, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 236.484375, + "completions/mean_terminated_length": 236.484375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2420966625213623, + "epoch": 2.846813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9732764114682843, + "kl": 0.061010293662548065, + "learning_rate": 8.4212991907755e-09, + "loss": 0.0168, + "num_tokens": 73060618.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6221625804901123, + "sampling/importance_sampling_ratio/mean": 1.0001819133758545, + "sampling/importance_sampling_ratio/min": 0.3637353777885437, + "sampling/sampling_logp_difference/max": 1.0113286972045898, + "sampling/sampling_logp_difference/mean": 0.01266421191394329, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 161.1875, + "completions/mean_terminated_length": 161.1875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2921450734138489, + "epoch": 2.8480392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6091477379720234, + "kl": 0.16933825612068176, + "learning_rate": 8.291603136767521e-09, + "loss": -0.0077, + "num_tokens": 73086742.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997191429138184, + "sampling/importance_sampling_ratio/min": 0.5376563668251038, + "sampling/sampling_logp_difference/max": 0.713315486907959, + "sampling/sampling_logp_difference/mean": 0.01770794577896595, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 164.578125, + "completions/mean_terminated_length": 164.578125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.20299825072288513, + "epoch": 2.849264705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5280261820469567, + "kl": 0.08744926005601883, + "learning_rate": 8.16290521965457e-09, + "loss": 0.016, + "num_tokens": 73111387.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000653266906738, + "sampling/importance_sampling_ratio/min": 0.6009537577629089, + "sampling/sampling_logp_difference/max": 0.6948471069335938, + "sampling/sampling_logp_difference/mean": 0.013114454224705696, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 149.890625, + "completions/mean_terminated_length": 149.890625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.19350628554821014, + "epoch": 2.8504901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05586272455051703, + "kl": 0.06838033348321915, + "learning_rate": 8.035205700685165e-09, + "loss": 0.0007, + "num_tokens": 73137668.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004388093948364, + "sampling/importance_sampling_ratio/min": 0.14207322895526886, + "sampling/sampling_logp_difference/max": 1.9514126777648926, + "sampling/sampling_logp_difference/mean": 0.012857876718044281, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 212.40625, + "completions/mean_terminated_length": 212.40625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.22763755917549133, + "epoch": 2.8517156862745097, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6426674429837307, + "kl": 0.06477226316928864, + "learning_rate": 7.908504839081342e-09, + "loss": -0.0483, + "num_tokens": 73168510.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.7704322338104248, + "sampling/importance_sampling_ratio/mean": 0.9998475909233093, + "sampling/importance_sampling_ratio/min": 0.6149999499320984, + "sampling/sampling_logp_difference/max": 0.5712237358093262, + "sampling/sampling_logp_difference/mean": 0.013438526540994644, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 181.125, + "completions/mean_terminated_length": 181.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24785181879997253, + "epoch": 2.8529411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.906870275441614, + "kl": 0.11384504288434982, + "learning_rate": 7.7828028920377e-09, + "loss": 0.026, + "num_tokens": 73201878.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.7291810512542725, + "sampling/importance_sampling_ratio/mean": 0.9998856782913208, + "sampling/importance_sampling_ratio/min": 0.4812855124473572, + "sampling/sampling_logp_difference/max": 0.7312946319580078, + "sampling/sampling_logp_difference/mean": 0.01550037320703268, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 172.875, + "completions/mean_terminated_length": 172.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.22001373767852783, + "epoch": 2.8541666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056442269345327326, + "kl": 0.06984420120716095, + "learning_rate": 7.658100114721344e-09, + "loss": 0.0007, + "num_tokens": 73230206.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5333000421524048, + "sampling/importance_sampling_ratio/mean": 0.9998418092727661, + "sampling/importance_sampling_ratio/min": 0.6378348469734192, + "sampling/sampling_logp_difference/max": 0.449675977230072, + "sampling/sampling_logp_difference/mean": 0.012991837225854397, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 233.265625, + "completions/mean_terminated_length": 233.265625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.2905122637748718, + "epoch": 2.855392156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9336762011066053, + "kl": 0.1303703337907791, + "learning_rate": 7.534396760270956e-09, + "loss": 0.0308, + "num_tokens": 73266783.0, + "reward": 0.1875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6623878479003906, + "sampling/importance_sampling_ratio/mean": 1.000367283821106, + "sampling/importance_sampling_ratio/min": 0.4797719717025757, + "sampling/sampling_logp_difference/max": 0.7344443798065186, + "sampling/sampling_logp_difference/mean": 0.015111393295228481, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 242.015625, + "completions/mean_terminated_length": 242.015625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2494461089372635, + "epoch": 2.8566176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.436034974723783, + "kl": 0.06503041088581085, + "learning_rate": 7.411693079796499e-09, + "loss": 0.0066, + "num_tokens": 73299808.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996614456176758, + "sampling/importance_sampling_ratio/min": 0.4776008725166321, + "sampling/sampling_logp_difference/max": 0.7389798760414124, + "sampling/sampling_logp_difference/mean": 0.01508298609405756, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 161.734375, + "completions/mean_terminated_length": 161.734375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2661849558353424, + "epoch": 2.857843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5369105222747956, + "kl": 0.10893932729959488, + "learning_rate": 7.289989322378731e-09, + "loss": -0.0015, + "num_tokens": 73328111.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5277687311172485, + "sampling/importance_sampling_ratio/mean": 0.9992194175720215, + "sampling/importance_sampling_ratio/min": 0.21387483179569244, + "sampling/sampling_logp_difference/max": 1.5423643589019775, + "sampling/sampling_logp_difference/mean": 0.017198316752910614, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 174.28125, + "completions/mean_terminated_length": 174.28125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.24874961376190186, + "epoch": 2.8590686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4183432578176454, + "kl": 0.09126145392656326, + "learning_rate": 7.169285735068531e-09, + "loss": 0.0184, + "num_tokens": 73359169.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.5815569758415222, + "sampling/sampling_logp_difference/max": 0.542046308517456, + "sampling/sampling_logp_difference/mean": 0.013274673372507095, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 154.234375, + "completions/mean_terminated_length": 154.234375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2339939922094345, + "epoch": 2.860294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4186821388430984, + "kl": 0.10646519064903259, + "learning_rate": 7.049582562886513e-09, + "loss": 0.0014, + "num_tokens": 73382592.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6207294464111328, + "sampling/importance_sampling_ratio/mean": 1.0001579523086548, + "sampling/importance_sampling_ratio/min": 0.6015142202377319, + "sampling/sampling_logp_difference/max": 0.5083050727844238, + "sampling/sampling_logp_difference/mean": 0.013911631889641285, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 243.546875, + "completions/mean_terminated_length": 243.546875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.27443066239356995, + "epoch": 2.861519607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5596873343135422, + "kl": 0.06345244497060776, + "learning_rate": 6.930880048822529e-09, + "loss": 0.0419, + "num_tokens": 73415219.0, + "reward": 0.34375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5370264053344727, + "sampling/importance_sampling_ratio/mean": 1.0004478693008423, + "sampling/importance_sampling_ratio/min": 0.4976615905761719, + "sampling/sampling_logp_difference/max": 0.6978349685668945, + "sampling/sampling_logp_difference/mean": 0.014734284020960331, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 142.5, + "completions/mean_terminated_length": 142.5, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.28680717945098877, + "epoch": 2.8627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.9868483285711989, + "kl": 0.15480905771255493, + "learning_rate": 6.813178433835221e-09, + "loss": 0.0318, + "num_tokens": 73436355.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5985099077224731, + "sampling/importance_sampling_ratio/mean": 1.0001389980316162, + "sampling/importance_sampling_ratio/min": 0.3810408413410187, + "sampling/sampling_logp_difference/max": 0.9648487567901611, + "sampling/sampling_logp_difference/mean": 0.015054525807499886, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 173.515625, + "completions/mean_terminated_length": 173.515625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.24999219179153442, + "epoch": 2.8639705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.253634920109201, + "kl": 0.09357550740242004, + "learning_rate": 6.696477956851354e-09, + "loss": 0.011, + "num_tokens": 73468244.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996324777603149, + "sampling/importance_sampling_ratio/min": 0.3114042282104492, + "sampling/sampling_logp_difference/max": 1.166663408279419, + "sampling/sampling_logp_difference/mean": 0.015080911107361317, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 143.8125, + "completions/mean_terminated_length": 143.8125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.18993791937828064, + "epoch": 2.8651960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.189872256789966, + "kl": 0.07720914483070374, + "learning_rate": 6.580778854765489e-09, + "loss": 0.0085, + "num_tokens": 73497880.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6078691482543945, + "sampling/importance_sampling_ratio/mean": 0.999321460723877, + "sampling/importance_sampling_ratio/min": 0.4330565929412842, + "sampling/sampling_logp_difference/max": 0.8368868827819824, + "sampling/sampling_logp_difference/mean": 0.013005631975829601, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 243.859375, + "completions/mean_terminated_length": 243.859375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.28375160694122314, + "epoch": 2.866421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1260263439134148, + "kl": 0.08156538009643555, + "learning_rate": 6.4660813624395905e-09, + "loss": 0.0083, + "num_tokens": 73534111.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5071581602096558, + "sampling/importance_sampling_ratio/mean": 0.9996317625045776, + "sampling/importance_sampling_ratio/min": 0.40564998984336853, + "sampling/sampling_logp_difference/max": 0.9022645950317383, + "sampling/sampling_logp_difference/mean": 0.01562865637242794, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 201.3125, + "completions/mean_terminated_length": 201.3125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2539217174053192, + "epoch": 2.8676470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0983780464082489, + "kl": 0.07987286150455475, + "learning_rate": 6.3523857127021905e-09, + "loss": 0.0074, + "num_tokens": 73566035.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000715255737305, + "sampling/importance_sampling_ratio/min": 0.11404351145029068, + "sampling/sampling_logp_difference/max": 2.171175241470337, + "sampling/sampling_logp_difference/mean": 0.014142842963337898, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 189.703125, + "completions/mean_terminated_length": 189.703125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.19674170017242432, + "epoch": 2.868872549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4984189584691552, + "kl": 0.07396071404218674, + "learning_rate": 6.239692136348284e-09, + "loss": 0.0059, + "num_tokens": 73598736.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.8570950031280518, + "sampling/importance_sampling_ratio/mean": 1.0001121759414673, + "sampling/importance_sampling_ratio/min": 0.3313627541065216, + "sampling/sampling_logp_difference/max": 1.104541540145874, + "sampling/sampling_logp_difference/mean": 0.01367383636534214, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 195.09375, + "completions/mean_terminated_length": 195.09375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.25682249665260315, + "epoch": 2.8700980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7423779275368756, + "kl": 0.08152695000171661, + "learning_rate": 6.12800086213866e-09, + "loss": -0.0015, + "num_tokens": 73630582.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7536267042160034, + "sampling/importance_sampling_ratio/mean": 1.0003193616867065, + "sampling/importance_sampling_ratio/min": 0.5676500201225281, + "sampling/sampling_logp_difference/max": 0.566250205039978, + "sampling/sampling_logp_difference/mean": 0.013737764209508896, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 209.28125, + "completions/mean_terminated_length": 209.28125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3201642632484436, + "epoch": 2.8713235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1829241098472207, + "kl": 0.08646942675113678, + "learning_rate": 6.017312116799566e-09, + "loss": 0.069, + "num_tokens": 73661016.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6496838331222534, + "sampling/importance_sampling_ratio/mean": 1.000123381614685, + "sampling/importance_sampling_ratio/min": 0.5428546071052551, + "sampling/sampling_logp_difference/max": 0.6109137535095215, + "sampling/sampling_logp_difference/mean": 0.01596347987651825, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 139.421875, + "completions/mean_terminated_length": 139.421875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.1933838427066803, + "epoch": 2.872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1150050333548559, + "kl": 0.07972557842731476, + "learning_rate": 5.907626125022158e-09, + "loss": 0.0008, + "num_tokens": 73686451.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6496334075927734, + "sampling/importance_sampling_ratio/mean": 0.9993104934692383, + "sampling/importance_sampling_ratio/min": 0.5260617733001709, + "sampling/sampling_logp_difference/max": 0.6423366069793701, + "sampling/sampling_logp_difference/mean": 0.012571008875966072, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 169.8125, + "completions/mean_terminated_length": 169.8125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.22068004310131073, + "epoch": 2.873774509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03778412130877862, + "kl": 0.05655568093061447, + "learning_rate": 5.798943109461995e-09, + "loss": 0.0006, + "num_tokens": 73713639.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.464235544204712, + "sampling/importance_sampling_ratio/mean": 0.9997299909591675, + "sampling/importance_sampling_ratio/min": 0.6174396872520447, + "sampling/sampling_logp_difference/max": 0.4821739196777344, + "sampling/sampling_logp_difference/mean": 0.012931197881698608, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 197.65625, + "completions/mean_terminated_length": 197.65625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.2522512674331665, + "epoch": 2.875, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4023857839598615, + "kl": 0.09977763891220093, + "learning_rate": 5.691263290738824e-09, + "loss": 0.0637, + "num_tokens": 73745889.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.550814151763916, + "sampling/importance_sampling_ratio/mean": 1.0001001358032227, + "sampling/importance_sampling_ratio/min": 0.5773372054100037, + "sampling/sampling_logp_difference/max": 0.5493288040161133, + "sampling/sampling_logp_difference/mean": 0.014501580968499184, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 201.0, + "completions/mean_terminated_length": 201.0, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2255808264017105, + "epoch": 2.876225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3497166065174733, + "kl": 0.05652454122900963, + "learning_rate": 5.5845868874357385e-09, + "loss": 0.002, + "num_tokens": 73780145.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5456663370132446, + "sampling/importance_sampling_ratio/mean": 1.0001392364501953, + "sampling/importance_sampling_ratio/min": 0.40637242794036865, + "sampling/sampling_logp_difference/max": 0.9004852771759033, + "sampling/sampling_logp_difference/mean": 0.013296417891979218, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 257.921875, + "completions/mean_terminated_length": 257.921875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.2910671830177307, + "epoch": 2.877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03939602125877812, + "kl": 0.054026078432798386, + "learning_rate": 5.4789141160991314e-09, + "loss": 0.0005, + "num_tokens": 73819852.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.637647032737732, + "sampling/importance_sampling_ratio/mean": 1.000435709953308, + "sampling/importance_sampling_ratio/min": 0.47571611404418945, + "sampling/sampling_logp_difference/max": 0.742933988571167, + "sampling/sampling_logp_difference/mean": 0.016137175261974335, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 169.71875, + "completions/mean_terminated_length": 169.71875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.24733196198940277, + "epoch": 2.8786764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5302278882351905, + "kl": 0.14274589717388153, + "learning_rate": 5.374245191238025e-09, + "loss": -0.0273, + "num_tokens": 73845690.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001418590545654, + "sampling/importance_sampling_ratio/min": 0.6148183941841125, + "sampling/sampling_logp_difference/max": 0.7016294002532959, + "sampling/sampling_logp_difference/mean": 0.014793280512094498, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 204.578125, + "completions/mean_terminated_length": 204.578125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.22955816984176636, + "epoch": 2.8799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8460039376966723, + "kl": 0.08136755228042603, + "learning_rate": 5.270580325323681e-09, + "loss": -0.0066, + "num_tokens": 73878335.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6463840007781982, + "sampling/importance_sampling_ratio/mean": 0.9999974966049194, + "sampling/importance_sampling_ratio/min": 0.5290601849555969, + "sampling/sampling_logp_difference/max": 0.6366531252861023, + "sampling/sampling_logp_difference/mean": 0.012156964279711246, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 228.90625, + "completions/mean_terminated_length": 228.90625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2115429937839508, + "epoch": 2.881127450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9427606694568517, + "kl": 0.06694163382053375, + "learning_rate": 5.167919728789271e-09, + "loss": -0.0038, + "num_tokens": 73909689.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5651397705078125, + "sampling/importance_sampling_ratio/mean": 0.9999818801879883, + "sampling/importance_sampling_ratio/min": 0.598135769367218, + "sampling/sampling_logp_difference/max": 0.5139374732971191, + "sampling/sampling_logp_difference/mean": 0.01208187360316515, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 203.3125, + "completions/mean_terminated_length": 203.3125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.17354030907154083, + "epoch": 2.8823529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.995867825936497, + "kl": 0.06583705544471741, + "learning_rate": 5.0662636100292086e-09, + "loss": 0.0132, + "num_tokens": 73937341.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.9133031368255615, + "sampling/importance_sampling_ratio/mean": 0.9998100399971008, + "sampling/importance_sampling_ratio/min": 0.45969629287719727, + "sampling/sampling_logp_difference/max": 0.7771892547607422, + "sampling/sampling_logp_difference/mean": 0.01046935748308897, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 231.78125, + "completions/mean_terminated_length": 231.78125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3074193596839905, + "epoch": 2.883578431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.014062199626538, + "kl": 0.0723794549703598, + "learning_rate": 4.965612175399092e-09, + "loss": -0.0166, + "num_tokens": 73976207.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.482747197151184, + "sampling/importance_sampling_ratio/mean": 0.9993435740470886, + "sampling/importance_sampling_ratio/min": 0.4954190254211426, + "sampling/sampling_logp_difference/max": 0.7023513317108154, + "sampling/sampling_logp_difference/mean": 0.016915656626224518, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 203.078125, + "completions/mean_terminated_length": 203.078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.24749070405960083, + "epoch": 2.8848039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.900352336707037, + "kl": 0.08469018340110779, + "learning_rate": 4.865965629214819e-09, + "loss": 0.0537, + "num_tokens": 74008564.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5668904781341553, + "sampling/importance_sampling_ratio/mean": 0.9996317625045776, + "sampling/importance_sampling_ratio/min": 0.5433550477027893, + "sampling/sampling_logp_difference/max": 0.609992265701294, + "sampling/sampling_logp_difference/mean": 0.014950131066143513, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 197.40625, + "completions/mean_terminated_length": 197.40625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.21615588665008545, + "epoch": 2.8860294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3787796597011528, + "kl": 0.07328522205352783, + "learning_rate": 4.767324173752696e-09, + "loss": -0.0242, + "num_tokens": 74036430.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5979527235031128, + "sampling/importance_sampling_ratio/mean": 0.9993993043899536, + "sampling/importance_sampling_ratio/min": 0.5038020610809326, + "sampling/sampling_logp_difference/max": 0.6855719089508057, + "sampling/sampling_logp_difference/mean": 0.011895122937858105, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 168.34375, + "completions/mean_terminated_length": 168.34375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.23322808742523193, + "epoch": 2.8872549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9803163233145722, + "kl": 0.0867251604795456, + "learning_rate": 4.669688009248607e-09, + "loss": 0.0057, + "num_tokens": 74067076.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004045963287354, + "sampling/importance_sampling_ratio/min": 0.08583655208349228, + "sampling/sampling_logp_difference/max": 2.455310344696045, + "sampling/sampling_logp_difference/mean": 0.01472594402730465, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 191.03125, + "completions/mean_terminated_length": 191.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.20626437664031982, + "epoch": 2.888480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0205510258812007, + "kl": 0.08031073212623596, + "learning_rate": 4.5730573338976786e-09, + "loss": -0.0087, + "num_tokens": 74094422.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000152349472046, + "sampling/importance_sampling_ratio/min": 0.4937072992324829, + "sampling/sampling_logp_difference/max": 1.0611766576766968, + "sampling/sampling_logp_difference/mean": 0.013788919895887375, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 221.6875, + "completions/mean_terminated_length": 221.6875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.22249484062194824, + "epoch": 2.889705882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.6222429895918276, + "kl": 0.08763054013252258, + "learning_rate": 4.477432343854226e-09, + "loss": 0.0441, + "num_tokens": 74131618.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.8446756601333618, + "sampling/importance_sampling_ratio/mean": 1.0002224445343018, + "sampling/importance_sampling_ratio/min": 0.34300485253334045, + "sampling/sampling_logp_difference/max": 1.0700106620788574, + "sampling/sampling_logp_difference/mean": 0.013853983022272587, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 186.703125, + "completions/mean_terminated_length": 186.703125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.18491169810295105, + "epoch": 2.8909313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.256304694994609, + "kl": 0.05769640952348709, + "learning_rate": 4.382813233230698e-09, + "loss": -0.0553, + "num_tokens": 74160191.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6951230764389038, + "sampling/importance_sampling_ratio/mean": 1.0000591278076172, + "sampling/importance_sampling_ratio/min": 0.6097238659858704, + "sampling/sampling_logp_difference/max": 0.5277553796768188, + "sampling/sampling_logp_difference/mean": 0.011838407255709171, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 166.140625, + "completions/mean_terminated_length": 166.140625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.20564104616641998, + "epoch": 2.892156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.859432394324812, + "kl": 0.08301499485969543, + "learning_rate": 4.289200194098119e-09, + "loss": 0.05, + "num_tokens": 74190792.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.969706654548645, + "sampling/importance_sampling_ratio/mean": 0.999295711517334, + "sampling/importance_sampling_ratio/min": 0.47804099321365356, + "sampling/sampling_logp_difference/max": 0.7380588054656982, + "sampling/sampling_logp_difference/mean": 0.013555385172367096, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 188.640625, + "completions/mean_terminated_length": 188.640625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3259560465812683, + "epoch": 2.8933823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2541628273128105, + "kl": 0.13829918205738068, + "learning_rate": 4.196593416484873e-09, + "loss": -0.0037, + "num_tokens": 74219441.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003265142440796, + "sampling/importance_sampling_ratio/min": 0.6181949973106384, + "sampling/sampling_logp_difference/max": 0.7279109954833984, + "sampling/sampling_logp_difference/mean": 0.01699439249932766, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 181.453125, + "completions/mean_terminated_length": 181.453125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.25785890221595764, + "epoch": 2.894607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5208642846812594, + "kl": 0.08224321901798248, + "learning_rate": 4.104993088376974e-09, + "loss": 0.0062, + "num_tokens": 74246238.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.9391735792160034, + "sampling/importance_sampling_ratio/mean": 1.000069499015808, + "sampling/importance_sampling_ratio/min": 0.6216408014297485, + "sampling/sampling_logp_difference/max": 0.662261962890625, + "sampling/sampling_logp_difference/mean": 0.015074081718921661, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 198.265625, + "completions/mean_terminated_length": 198.265625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2786763310432434, + "epoch": 2.8958333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.678535910803952, + "kl": 0.08939115703105927, + "learning_rate": 4.0143993957171826e-09, + "loss": -0.0006, + "num_tokens": 74283679.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.9539135694503784, + "sampling/importance_sampling_ratio/mean": 0.9996293783187866, + "sampling/importance_sampling_ratio/min": 0.42159610986709595, + "sampling/sampling_logp_difference/max": 0.8637075424194336, + "sampling/sampling_logp_difference/mean": 0.016311483457684517, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 196.984375, + "completions/mean_terminated_length": 196.984375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.17440006136894226, + "epoch": 2.8970588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06420097834190534, + "kl": 0.05318032205104828, + "learning_rate": 3.924812522404952e-09, + "loss": 0.0005, + "num_tokens": 74318910.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6251327991485596, + "sampling/importance_sampling_ratio/mean": 0.9998441934585571, + "sampling/importance_sampling_ratio/min": 0.32394373416900635, + "sampling/sampling_logp_difference/max": 1.1271854639053345, + "sampling/sampling_logp_difference/mean": 0.012006749399006367, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 275.953125, + "completions/mean_terminated_length": 275.953125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2703547477722168, + "epoch": 2.8982843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3364025004222901, + "kl": 0.06834714859724045, + "learning_rate": 3.836232650296034e-09, + "loss": 0.0206, + "num_tokens": 74355803.0, + "reward": 0.3125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6307568550109863, + "sampling/importance_sampling_ratio/mean": 1.0003077983856201, + "sampling/importance_sampling_ratio/min": 0.4889255464076996, + "sampling/sampling_logp_difference/max": 0.7155450582504272, + "sampling/sampling_logp_difference/mean": 0.01356726884841919, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 194.21875, + "completions/mean_terminated_length": 194.21875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2951743006706238, + "epoch": 2.8995098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.151388101507571, + "kl": 0.13011303544044495, + "learning_rate": 3.748659959201928e-09, + "loss": -0.0035, + "num_tokens": 74385001.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999871015548706, + "sampling/importance_sampling_ratio/min": 0.5403658747673035, + "sampling/sampling_logp_difference/max": 0.7130794525146484, + "sampling/sampling_logp_difference/mean": 0.016106517985463142, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 175.0, + "completions/mean_terminated_length": 175.0, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.1530219316482544, + "epoch": 2.900735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.344333949614905, + "kl": 0.07376052439212799, + "learning_rate": 3.6620946268896556e-09, + "loss": -0.0071, + "num_tokens": 74410729.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6253750324249268, + "sampling/importance_sampling_ratio/mean": 0.9999873042106628, + "sampling/importance_sampling_ratio/min": 0.4945722818374634, + "sampling/sampling_logp_difference/max": 0.7040619850158691, + "sampling/sampling_logp_difference/mean": 0.010870207101106644, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 205.03125, + "completions/mean_terminated_length": 205.03125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.21062304079532623, + "epoch": 2.9019607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6114536958132395, + "kl": 0.08530819416046143, + "learning_rate": 3.5765368290813223e-09, + "loss": -0.0212, + "num_tokens": 74442875.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5103265047073364, + "sampling/importance_sampling_ratio/mean": 1.0002667903900146, + "sampling/importance_sampling_ratio/min": 0.6178181171417236, + "sampling/sampling_logp_difference/max": 0.48156118392944336, + "sampling/sampling_logp_difference/mean": 0.012324854731559753, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 182.75, + "completions/mean_terminated_length": 182.75, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.21767497062683105, + "epoch": 2.903186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1853996194935956, + "kl": 0.07698185741901398, + "learning_rate": 3.491986739453889e-09, + "loss": 0.017, + "num_tokens": 74474731.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.7626488208770752, + "sampling/importance_sampling_ratio/mean": 1.0002050399780273, + "sampling/importance_sampling_ratio/min": 0.5483866930007935, + "sampling/sampling_logp_difference/max": 0.6007745265960693, + "sampling/sampling_logp_difference/mean": 0.013286888599395752, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 150.3125, + "completions/mean_terminated_length": 150.3125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.18176418542861938, + "epoch": 2.9044117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037758449916620904, + "kl": 0.05915432050824165, + "learning_rate": 3.4084445296386767e-09, + "loss": 0.0006, + "num_tokens": 74504671.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8179765939712524, + "sampling/importance_sampling_ratio/mean": 1.0005993843078613, + "sampling/importance_sampling_ratio/min": 0.29257166385650635, + "sampling/sampling_logp_difference/max": 1.2290456295013428, + "sampling/sampling_logp_difference/mean": 0.013469929806888103, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 158.1875, + "completions/mean_terminated_length": 158.1875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2425750195980072, + "epoch": 2.905637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0827567736911625, + "kl": 0.08463536202907562, + "learning_rate": 3.3259103692209745e-09, + "loss": -0.0042, + "num_tokens": 74532155.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6177663803100586, + "sampling/importance_sampling_ratio/mean": 0.998638927936554, + "sampling/importance_sampling_ratio/min": 0.5489455461502075, + "sampling/sampling_logp_difference/max": 0.5997560024261475, + "sampling/sampling_logp_difference/mean": 0.015707600861787796, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 167.765625, + "completions/mean_terminated_length": 167.765625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.24882417917251587, + "epoch": 2.906862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3194867085940902, + "kl": 0.08847443759441376, + "learning_rate": 3.2443844257400434e-09, + "loss": 0.0056, + "num_tokens": 74566652.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8555169105529785, + "sampling/importance_sampling_ratio/mean": 1.000077247619629, + "sampling/importance_sampling_ratio/min": 0.3558327257633209, + "sampling/sampling_logp_difference/max": 1.0332945585250854, + "sampling/sampling_logp_difference/mean": 0.015010975301265717, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 217.015625, + "completions/mean_terminated_length": 217.015625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.28991562128067017, + "epoch": 2.9080882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7664279811921024, + "kl": 0.11535454541444778, + "learning_rate": 3.163866864688336e-09, + "loss": -0.0425, + "num_tokens": 74599309.0, + "reward": -0.125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6979293823242188, + "sampling/importance_sampling_ratio/mean": 1.0001277923583984, + "sampling/importance_sampling_ratio/min": 0.5260913968086243, + "sampling/sampling_logp_difference/max": 0.6422803401947021, + "sampling/sampling_logp_difference/mean": 0.014657038263976574, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 171.5, + "completions/mean_terminated_length": 171.5, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.26413846015930176, + "epoch": 2.909313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16315822509215297, + "kl": 0.0894489660859108, + "learning_rate": 3.0843578495113877e-09, + "loss": 0.0009, + "num_tokens": 74629101.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7033652067184448, + "sampling/importance_sampling_ratio/mean": 1.00035560131073, + "sampling/importance_sampling_ratio/min": 0.4520955979824066, + "sampling/sampling_logp_difference/max": 0.7938616275787354, + "sampling/sampling_logp_difference/mean": 0.015666788443922997, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 189.234375, + "completions/mean_terminated_length": 189.234375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.26266130805015564, + "epoch": 2.9105392156862746, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.2143216733494686, + "kl": 0.12630251049995422, + "learning_rate": 3.0058575416073707e-09, + "loss": -0.0105, + "num_tokens": 74657324.0, + "reward": 0.34375, + "reward_std": 0.7297805547714233, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5772355794906616, + "sampling/importance_sampling_ratio/mean": 0.9998902082443237, + "sampling/importance_sampling_ratio/min": 0.3374631702899933, + "sampling/sampling_logp_difference/max": 1.086298942565918, + "sampling/sampling_logp_difference/mean": 0.016290338709950447, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 143.703125, + "completions/mean_terminated_length": 143.703125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.22800227999687195, + "epoch": 2.911764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09451691999504142, + "kl": 0.09714032709598541, + "learning_rate": 2.9283661003270952e-09, + "loss": 0.001, + "num_tokens": 74684649.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7779202461242676, + "sampling/importance_sampling_ratio/mean": 1.0003443956375122, + "sampling/importance_sampling_ratio/min": 0.49009910225868225, + "sampling/sampling_logp_difference/max": 0.7131476402282715, + "sampling/sampling_logp_difference/mean": 0.013768637552857399, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 208.203125, + "completions/mean_terminated_length": 208.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2165367305278778, + "epoch": 2.9129901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06683402199550116, + "kl": 0.08643889427185059, + "learning_rate": 2.851883682973233e-09, + "loss": 0.0008, + "num_tokens": 74718646.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.647499680519104, + "sampling/importance_sampling_ratio/mean": 1.0002728700637817, + "sampling/importance_sampling_ratio/min": 0.2931484580039978, + "sampling/sampling_logp_difference/max": 1.2270760536193848, + "sampling/sampling_logp_difference/mean": 0.012979322113096714, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 181.84375, + "completions/mean_terminated_length": 181.84375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.22348137199878693, + "epoch": 2.9142156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06213794220960048, + "kl": 0.060592323541641235, + "learning_rate": 2.776410444800148e-09, + "loss": 0.0006, + "num_tokens": 74750524.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7699233293533325, + "sampling/importance_sampling_ratio/mean": 0.9999657869338989, + "sampling/importance_sampling_ratio/min": 0.4934474527835846, + "sampling/sampling_logp_difference/max": 0.7063388824462891, + "sampling/sampling_logp_difference/mean": 0.013729427009820938, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 188.640625, + "completions/mean_terminated_length": 188.640625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.23032495379447937, + "epoch": 2.9154411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1745762357725924, + "kl": 0.07656420767307281, + "learning_rate": 2.701946539013844e-09, + "loss": -0.0202, + "num_tokens": 74780053.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.9472484588623047, + "sampling/importance_sampling_ratio/mean": 1.0007221698760986, + "sampling/importance_sampling_ratio/min": 0.5678617358207703, + "sampling/sampling_logp_difference/max": 0.6664173603057861, + "sampling/sampling_logp_difference/mean": 0.013936182484030724, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 142.609375, + "completions/mean_terminated_length": 142.609375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2128639966249466, + "epoch": 2.9166666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4505295862517897, + "kl": 0.07548115402460098, + "learning_rate": 2.628492116771297e-09, + "loss": 0.0028, + "num_tokens": 74807724.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6647119522094727, + "sampling/importance_sampling_ratio/mean": 1.0005338191986084, + "sampling/importance_sampling_ratio/min": 0.6141372323036194, + "sampling/sampling_logp_difference/max": 0.5096521377563477, + "sampling/sampling_logp_difference/mean": 0.01334769744426012, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 194.1875, + "completions/mean_terminated_length": 194.1875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3141852021217346, + "epoch": 2.917892156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8320177654093657, + "kl": 0.1052158772945404, + "learning_rate": 2.556047327180344e-09, + "loss": -0.0205, + "num_tokens": 74836168.0, + "reward": 0.40625, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.623957633972168, + "sampling/importance_sampling_ratio/mean": 0.9996415376663208, + "sampling/importance_sampling_ratio/min": 0.3899306058883667, + "sampling/sampling_logp_difference/max": 0.941786527633667, + "sampling/sampling_logp_difference/mean": 0.017161313444375992, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 180.21875, + "completions/mean_terminated_length": 180.21875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.256213903427124, + "epoch": 2.9191176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1278551278739337, + "kl": 0.096051424741745, + "learning_rate": 2.484612317299295e-09, + "loss": -0.0088, + "num_tokens": 74865574.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7726800441741943, + "sampling/importance_sampling_ratio/mean": 0.9998409748077393, + "sampling/importance_sampling_ratio/min": 0.5763106942176819, + "sampling/sampling_logp_difference/max": 0.5724925994873047, + "sampling/sampling_logp_difference/mean": 0.016871415078639984, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 236.421875, + "completions/mean_terminated_length": 236.421875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2461642324924469, + "epoch": 2.920343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1292133916352909, + "kl": 0.1166759729385376, + "learning_rate": 2.4141872321367107e-09, + "loss": 0.0052, + "num_tokens": 74896753.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997732639312744, + "sampling/importance_sampling_ratio/min": 0.4644794166088104, + "sampling/sampling_logp_difference/max": 0.7668380737304688, + "sampling/sampling_logp_difference/mean": 0.014024095609784126, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 161.9375, + "completions/mean_terminated_length": 161.9375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.22293847799301147, + "epoch": 2.9215686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07028081816171995, + "kl": 0.07327209413051605, + "learning_rate": 2.344772214651014e-09, + "loss": 0.0007, + "num_tokens": 74930109.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7980595827102661, + "sampling/importance_sampling_ratio/mean": 1.000113844871521, + "sampling/importance_sampling_ratio/min": 0.44718384742736816, + "sampling/sampling_logp_difference/max": 0.8047854900360107, + "sampling/sampling_logp_difference/mean": 0.01504216343164444, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 228.828125, + "completions/mean_terminated_length": 228.828125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.18402422964572906, + "epoch": 2.922794117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1222707178536586, + "kl": 0.04598855972290039, + "learning_rate": 2.2763674057503235e-09, + "loss": -0.0076, + "num_tokens": 74968994.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994775056838989, + "sampling/importance_sampling_ratio/min": 0.49969446659088135, + "sampling/sampling_logp_difference/max": 0.8864250183105469, + "sampling/sampling_logp_difference/mean": 0.012235797941684723, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.2333841621875763, + "epoch": 2.924019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09732410717908493, + "kl": 0.09015602618455887, + "learning_rate": 2.20897294429212e-09, + "loss": 0.001, + "num_tokens": 74997018.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7521053552627563, + "sampling/importance_sampling_ratio/mean": 0.9996908903121948, + "sampling/importance_sampling_ratio/min": 0.65540611743927, + "sampling/sampling_logp_difference/max": 0.560818076133728, + "sampling/sampling_logp_difference/mean": 0.013704277575016022, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 190.3125, + "completions/mean_terminated_length": 190.3125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.25898271799087524, + "epoch": 2.9252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09993229418286245, + "kl": 0.08073795586824417, + "learning_rate": 2.142588967082748e-09, + "loss": 0.0009, + "num_tokens": 75029070.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9561160802841187, + "sampling/importance_sampling_ratio/mean": 1.0003554821014404, + "sampling/importance_sampling_ratio/min": 0.5336609482765198, + "sampling/sampling_logp_difference/max": 0.6709609031677246, + "sampling/sampling_logp_difference/mean": 0.014637565240263939, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 227.71875, + "completions/mean_terminated_length": 227.71875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.2954796850681305, + "epoch": 2.9264705882352944, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.1971779506717732, + "kl": 0.10312424600124359, + "learning_rate": 2.0772156088776913e-09, + "loss": -0.0036, + "num_tokens": 75059932.0, + "reward": 0.3125, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6147301197052002, + "sampling/importance_sampling_ratio/mean": 1.000575065612793, + "sampling/importance_sampling_ratio/min": 0.4161491394042969, + "sampling/sampling_logp_difference/max": 0.8767116069793701, + "sampling/sampling_logp_difference/mean": 0.01552021037787199, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 167.703125, + "completions/mean_terminated_length": 167.703125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3421614170074463, + "epoch": 2.9276960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.370010336418139, + "kl": 0.10211165994405746, + "learning_rate": 2.0128530023804656e-09, + "loss": 0.0125, + "num_tokens": 75090329.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5467345714569092, + "sampling/importance_sampling_ratio/mean": 1.0004173517227173, + "sampling/importance_sampling_ratio/min": 0.5122334361076355, + "sampling/sampling_logp_difference/max": 0.6689748764038086, + "sampling/sampling_logp_difference/mean": 0.01810724101960659, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 276.734375, + "completions/mean_terminated_length": 276.734375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.24409303069114685, + "epoch": 2.928921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.366713240450411, + "kl": 0.0857454314827919, + "learning_rate": 1.9495012782433375e-09, + "loss": -0.0039, + "num_tokens": 75130648.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.8357044458389282, + "sampling/importance_sampling_ratio/mean": 1.0000190734863281, + "sampling/importance_sampling_ratio/min": 0.526208758354187, + "sampling/sampling_logp_difference/max": 0.6420572996139526, + "sampling/sampling_logp_difference/mean": 0.014364847913384438, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 176.765625, + "completions/mean_terminated_length": 176.765625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.19778773188591003, + "epoch": 2.9301470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053425809659322615, + "kl": 0.08036743104457855, + "learning_rate": 1.887160565066048e-09, + "loss": 0.0007, + "num_tokens": 75159241.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.959877610206604, + "sampling/importance_sampling_ratio/mean": 1.000080943107605, + "sampling/importance_sampling_ratio/min": 0.1343042552471161, + "sampling/sampling_logp_difference/max": 2.0076475143432617, + "sampling/sampling_logp_difference/mean": 0.013936562463641167, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 254.890625, + "completions/mean_terminated_length": 254.890625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.16103404760360718, + "epoch": 2.931372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0610412820395003, + "kl": 0.06309305131435394, + "learning_rate": 1.8258309893965374e-09, + "loss": 0.0006, + "num_tokens": 75197602.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.755321741104126, + "sampling/importance_sampling_ratio/mean": 0.9997216463088989, + "sampling/importance_sampling_ratio/min": 0.6123639941215515, + "sampling/sampling_logp_difference/max": 0.5626522302627563, + "sampling/sampling_logp_difference/mean": 0.01104649156332016, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 155.265625, + "completions/mean_terminated_length": 155.265625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.2115069031715393, + "epoch": 2.9325980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04934542419140079, + "kl": 0.07161717116832733, + "learning_rate": 1.7655126757297744e-09, + "loss": 0.0007, + "num_tokens": 75226099.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994068145751953, + "sampling/importance_sampling_ratio/min": 0.475801557302475, + "sampling/sampling_logp_difference/max": 0.8327982425689697, + "sampling/sampling_logp_difference/mean": 0.013303788378834724, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 146.09375, + "completions/mean_terminated_length": 146.09375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.23196184635162354, + "epoch": 2.9338235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6438645232529383, + "kl": 0.0847909152507782, + "learning_rate": 1.7062057465082046e-09, + "loss": -0.0107, + "num_tokens": 75253577.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5488466024398804, + "sampling/importance_sampling_ratio/mean": 0.9996697306632996, + "sampling/importance_sampling_ratio/min": 0.5592991709709167, + "sampling/sampling_logp_difference/max": 0.5810707807540894, + "sampling/sampling_logp_difference/mean": 0.014680958352982998, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 158.5625, + "completions/mean_terminated_length": 158.5625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.24769297242164612, + "epoch": 2.935049019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2540197987817803, + "kl": 0.09263478219509125, + "learning_rate": 1.6479103221211377e-09, + "loss": 0.0056, + "num_tokens": 75283101.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000531673431396, + "sampling/importance_sampling_ratio/min": 0.6368657350540161, + "sampling/sampling_logp_difference/max": 0.8148508071899414, + "sampling/sampling_logp_difference/mean": 0.01475741621106863, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 230.015625, + "completions/mean_terminated_length": 230.015625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.27384263277053833, + "epoch": 2.936274509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042707395194060215, + "kl": 0.07987666875123978, + "learning_rate": 1.5906265209045254e-09, + "loss": 0.0008, + "num_tokens": 75315470.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940991401672, + "sampling/importance_sampling_ratio/min": 0.5005773305892944, + "sampling/sampling_logp_difference/max": 1.0181095600128174, + "sampling/sampling_logp_difference/mean": 0.0167281124740839, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 222.9375, + "completions/mean_terminated_length": 222.9375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.19160006940364838, + "epoch": 2.9375, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4533464685523405, + "kl": 0.059094592928886414, + "learning_rate": 1.534354459140963e-09, + "loss": 0.0399, + "num_tokens": 75343930.0, + "reward": 0.15625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6478482484817505, + "sampling/importance_sampling_ratio/mean": 0.9995725154876709, + "sampling/importance_sampling_ratio/min": 0.48819631338119507, + "sampling/sampling_logp_difference/max": 0.7170376777648926, + "sampling/sampling_logp_difference/mean": 0.010771173983812332, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 168.1875, + "completions/mean_terminated_length": 168.1875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.1975494623184204, + "epoch": 2.938725490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03824227447430867, + "kl": 0.06701640039682388, + "learning_rate": 1.4790942510590766e-09, + "loss": 0.0007, + "num_tokens": 75371606.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5930945873260498, + "sampling/importance_sampling_ratio/mean": 0.9998019933700562, + "sampling/importance_sampling_ratio/min": 0.6237468719482422, + "sampling/sampling_logp_difference/max": 0.47201061248779297, + "sampling/sampling_logp_difference/mean": 0.012317480519413948, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 152.203125, + "completions/mean_terminated_length": 152.203125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.20233231782913208, + "epoch": 2.939950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3729668722016508, + "kl": 0.07118867337703705, + "learning_rate": 1.4248460088335801e-09, + "loss": -0.0233, + "num_tokens": 75397795.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006823539733887, + "sampling/importance_sampling_ratio/min": 0.3063874840736389, + "sampling/sampling_logp_difference/max": 1.1829047203063965, + "sampling/sampling_logp_difference/mean": 0.013798831030726433, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 248.015625, + "completions/mean_terminated_length": 248.015625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2584313452243805, + "epoch": 2.9411764705882355, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.697534943531724, + "kl": 0.07394785434007645, + "learning_rate": 1.371609842585053e-09, + "loss": 0.028, + "num_tokens": 75432196.0, + "reward": 0.8125, + "reward_std": 0.5123475193977356, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5889581441879272, + "sampling/importance_sampling_ratio/mean": 1.0002371072769165, + "sampling/importance_sampling_ratio/min": 0.5168574452400208, + "sampling/sampling_logp_difference/max": 0.6599881649017334, + "sampling/sampling_logp_difference/mean": 0.013856764882802963, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 189.234375, + "completions/mean_terminated_length": 189.234375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.3257656991481781, + "epoch": 2.9424019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10361650664180576, + "kl": 0.12545722723007202, + "learning_rate": 1.319385860379496e-09, + "loss": 0.0012, + "num_tokens": 75466611.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996939897537231, + "sampling/importance_sampling_ratio/min": 0.4234618544578552, + "sampling/sampling_logp_difference/max": 0.9860789775848389, + "sampling/sampling_logp_difference/mean": 0.018942903727293015, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 207.171875, + "completions/mean_terminated_length": 207.171875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2865198850631714, + "epoch": 2.943627450980392, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9337615033533684, + "kl": 0.09686718881130219, + "learning_rate": 1.2681741682282754e-09, + "loss": -0.0054, + "num_tokens": 75494510.0, + "reward": 0.5, + "reward_std": 0.5651718378067017, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7766722440719604, + "sampling/importance_sampling_ratio/mean": 0.9999604225158691, + "sampling/importance_sampling_ratio/min": 0.6210312247276306, + "sampling/sampling_logp_difference/max": 0.5747420787811279, + "sampling/sampling_logp_difference/mean": 0.014227193780243397, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 229.640625, + "completions/mean_terminated_length": 229.640625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.23316679894924164, + "epoch": 2.9448529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07515850973576715, + "kl": 0.07214280962944031, + "learning_rate": 1.217974870087901e-09, + "loss": 0.0007, + "num_tokens": 75526903.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6321943998336792, + "sampling/importance_sampling_ratio/mean": 1.000157356262207, + "sampling/importance_sampling_ratio/min": 0.5540791749954224, + "sampling/sampling_logp_difference/max": 0.5904476642608643, + "sampling/sampling_logp_difference/mean": 0.013380090706050396, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 171.890625, + "completions/mean_terminated_length": 171.890625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.33587875962257385, + "epoch": 2.946078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3763860182258667, + "kl": 0.13876089453697205, + "learning_rate": 1.1687880678596939e-09, + "loss": 0.0202, + "num_tokens": 75563424.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.539456844329834, + "sampling/importance_sampling_ratio/mean": 1.0002048015594482, + "sampling/importance_sampling_ratio/min": 0.524482011795044, + "sampling/sampling_logp_difference/max": 0.6453441381454468, + "sampling/sampling_logp_difference/mean": 0.018524974584579468, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 160.296875, + "completions/mean_terminated_length": 160.296875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.23508617281913757, + "epoch": 2.9473039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6674638252877323, + "kl": 0.08367919921875, + "learning_rate": 1.1206138613898962e-09, + "loss": -0.0234, + "num_tokens": 75588867.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4926220178604126, + "sampling/importance_sampling_ratio/mean": 1.0001332759857178, + "sampling/importance_sampling_ratio/min": 0.44747617840766907, + "sampling/sampling_logp_difference/max": 0.8041319847106934, + "sampling/sampling_logp_difference/mean": 0.014118922874331474, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 182.671875, + "completions/mean_terminated_length": 182.671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.16823163628578186, + "epoch": 2.9485294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04092480094650376, + "kl": 0.056618332862854004, + "learning_rate": 1.0734523484689507e-09, + "loss": 0.0005, + "num_tokens": 75620318.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999382495880127, + "sampling/importance_sampling_ratio/min": 0.45940688252449036, + "sampling/sampling_logp_difference/max": 0.8144283294677734, + "sampling/sampling_logp_difference/mean": 0.011973707936704159, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 197.796875, + "completions/mean_terminated_length": 197.796875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2799832224845886, + "epoch": 2.9497549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0930479009365046, + "kl": 0.09062144160270691, + "learning_rate": 1.0273036248318324e-09, + "loss": 0.0163, + "num_tokens": 75650945.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001051425933838, + "sampling/importance_sampling_ratio/min": 0.6147287487983704, + "sampling/sampling_logp_difference/max": 0.766869068145752, + "sampling/sampling_logp_difference/mean": 0.013567798770964146, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 204.15625, + "completions/mean_terminated_length": 204.15625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.25723665952682495, + "epoch": 2.950980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4889937073059785, + "kl": 0.09684564918279648, + "learning_rate": 9.82167784157495e-10, + "loss": -0.0005, + "num_tokens": 75679355.0, + "reward": 0.34375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.62906014919281, + "sampling/importance_sampling_ratio/mean": 0.9999738335609436, + "sampling/importance_sampling_ratio/min": 0.6158067584037781, + "sampling/sampling_logp_difference/max": 0.4880032539367676, + "sampling/sampling_logp_difference/mean": 0.014383267611265182, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 193.65625, + "completions/mean_terminated_length": 193.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.252433717250824, + "epoch": 2.952205882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.963388265256947, + "kl": 0.10595382750034332, + "learning_rate": 9.380449180688143e-10, + "loss": -0.0252, + "num_tokens": 75708565.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 0.36926916241645813, + "sampling/sampling_logp_difference/max": 0.9962295293807983, + "sampling/sampling_logp_difference/mean": 0.01537374872714281, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 241.625, + "completions/mean_terminated_length": 241.625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.21027690172195435, + "epoch": 2.9534313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03188025150667092, + "kl": 0.04365028068423271, + "learning_rate": 8.949351161324225e-10, + "loss": 0.0004, + "num_tokens": 75745885.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 0.4754011332988739, + "sampling/sampling_logp_difference/max": 0.7435963153839111, + "sampling/sampling_logp_difference/mean": 0.012878429144620895, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 144.09375, + "completions/mean_terminated_length": 144.09375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.191091388463974, + "epoch": 2.954656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3793387235702022, + "kl": 0.12095391750335693, + "learning_rate": 8.528384658584853e-10, + "loss": 0.0054, + "num_tokens": 75769843.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.9066637754440308, + "sampling/importance_sampling_ratio/mean": 0.9999489784240723, + "sampling/importance_sampling_ratio/min": 0.4309251308441162, + "sampling/sampling_logp_difference/max": 0.8418209552764893, + "sampling/sampling_logp_difference/mean": 0.012651410885155201, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 218.53125, + "completions/mean_terminated_length": 218.53125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.23037388920783997, + "epoch": 2.9558823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9820654691587366, + "kl": 0.05584049969911575, + "learning_rate": 8.117550527005912e-10, + "loss": 0.0105, + "num_tokens": 75799893.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000017762184143, + "sampling/importance_sampling_ratio/min": 0.626641035079956, + "sampling/sampling_logp_difference/max": 0.7562308311462402, + "sampling/sampling_logp_difference/mean": 0.012850666418671608, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 145.515625, + "completions/mean_terminated_length": 145.515625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.272765189409256, + "epoch": 2.957107843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.2109855999852184, + "kl": 0.11711627244949341, + "learning_rate": 7.716849600554188e-10, + "loss": 0.013, + "num_tokens": 75826294.0, + "reward": 0.40625, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.9099805355072021, + "sampling/importance_sampling_ratio/mean": 0.9996508359909058, + "sampling/importance_sampling_ratio/min": 0.544747531414032, + "sampling/sampling_logp_difference/max": 0.6470930576324463, + "sampling/sampling_logp_difference/mean": 0.016956236213445663, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 183.078125, + "completions/mean_terminated_length": 183.078125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.20906969904899597, + "epoch": 2.9583333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3978070372573852, + "kl": 0.08074396848678589, + "learning_rate": 7.326282692626806e-10, + "loss": 0.0072, + "num_tokens": 75852347.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5071555376052856, + "sampling/importance_sampling_ratio/mean": 0.999751091003418, + "sampling/importance_sampling_ratio/min": 0.5765681266784668, + "sampling/sampling_logp_difference/max": 0.5506618022918701, + "sampling/sampling_logp_difference/mean": 0.012953568249940872, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 246.515625, + "completions/mean_terminated_length": 246.515625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.31527090072631836, + "epoch": 2.9595588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.638447493144575, + "kl": 0.0897536426782608, + "learning_rate": 6.945850596050684e-10, + "loss": 0.0045, + "num_tokens": 75884444.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.650807499885559, + "sampling/importance_sampling_ratio/mean": 0.9997353553771973, + "sampling/importance_sampling_ratio/min": 0.5791006684303284, + "sampling/sampling_logp_difference/max": 0.5462789535522461, + "sampling/sampling_logp_difference/mean": 0.015875663608312607, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 161.40625, + "completions/mean_terminated_length": 161.40625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.2676832973957062, + "epoch": 2.9607843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7306499516456963, + "kl": 0.08243481814861298, + "learning_rate": 6.575554083078083e-10, + "loss": 0.0248, + "num_tokens": 75911254.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.697697401046753, + "sampling/importance_sampling_ratio/mean": 0.9998440146446228, + "sampling/importance_sampling_ratio/min": 0.6348111629486084, + "sampling/sampling_logp_difference/max": 0.5292727947235107, + "sampling/sampling_logp_difference/mean": 0.015132778324186802, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 182.859375, + "completions/mean_terminated_length": 182.859375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.1757541000843048, + "epoch": 2.9620098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.387450335820891, + "kl": 0.0584825724363327, + "learning_rate": 6.215393905388278e-10, + "loss": 0.2234, + "num_tokens": 75939133.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6519036293029785, + "sampling/importance_sampling_ratio/mean": 1.0005888938903809, + "sampling/importance_sampling_ratio/min": 0.5140793323516846, + "sampling/sampling_logp_difference/max": 0.6653776168823242, + "sampling/sampling_logp_difference/mean": 0.011330155655741692, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 186.46875, + "completions/mean_terminated_length": 186.46875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.2033308893442154, + "epoch": 2.963235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6699127491801236, + "kl": 0.07381125539541245, + "learning_rate": 5.865370794082558e-10, + "loss": -0.001, + "num_tokens": 75969115.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6007341146469116, + "sampling/importance_sampling_ratio/mean": 0.9995757937431335, + "sampling/importance_sampling_ratio/min": 0.6202810406684875, + "sampling/sampling_logp_difference/max": 0.4775826930999756, + "sampling/sampling_logp_difference/mean": 0.012664059177041054, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 178.640625, + "completions/mean_terminated_length": 178.640625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.2655785381793976, + "epoch": 2.9644607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.9247218081431783, + "kl": 0.1148318350315094, + "learning_rate": 5.525485459687007e-10, + "loss": 0.0007, + "num_tokens": 75997972.0, + "reward": 0.8125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999703049659729, + "sampling/importance_sampling_ratio/min": 0.5523868203163147, + "sampling/sampling_logp_difference/max": 0.8148813247680664, + "sampling/sampling_logp_difference/mean": 0.014968165196478367, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 198.1875, + "completions/mean_terminated_length": 198.1875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2387477457523346, + "epoch": 2.965686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9179642676561003, + "kl": 0.0756097286939621, + "learning_rate": 5.195738592145838e-10, + "loss": -0.0054, + "num_tokens": 76040528.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6207278966903687, + "sampling/importance_sampling_ratio/mean": 0.9994602799415588, + "sampling/importance_sampling_ratio/min": 0.6100690960884094, + "sampling/sampling_logp_difference/max": 0.4941830635070801, + "sampling/sampling_logp_difference/mean": 0.015103841200470924, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 229.015625, + "completions/mean_terminated_length": 229.015625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.2549667954444885, + "epoch": 2.9669117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03421188773261006, + "kl": 0.05629030987620354, + "learning_rate": 4.876130860825278e-10, + "loss": 0.0006, + "num_tokens": 76076129.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6196949481964111, + "sampling/importance_sampling_ratio/mean": 1.0002193450927734, + "sampling/importance_sampling_ratio/min": 0.5888904333114624, + "sampling/sampling_logp_difference/max": 0.5295150876045227, + "sampling/sampling_logp_difference/mean": 0.01382712833583355, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 177.78125, + "completions/mean_terminated_length": 177.78125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.19116365909576416, + "epoch": 2.968137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048813274652691925, + "kl": 0.04953814297914505, + "learning_rate": 4.566662914508579e-10, + "loss": 0.0005, + "num_tokens": 76104707.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9391807317733765, + "sampling/importance_sampling_ratio/mean": 0.9995027184486389, + "sampling/importance_sampling_ratio/min": 0.47432684898376465, + "sampling/sampling_logp_difference/max": 0.7458586692810059, + "sampling/sampling_logp_difference/mean": 0.012865998782217503, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 244.421875, + "completions/mean_terminated_length": 244.421875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.20995011925697327, + "epoch": 2.969362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5809898301512049, + "kl": 0.05042857304215431, + "learning_rate": 4.267335381396564e-10, + "loss": 0.0375, + "num_tokens": 76144974.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5133386850357056, + "sampling/importance_sampling_ratio/mean": 1.0000345706939697, + "sampling/importance_sampling_ratio/min": 0.47389116883277893, + "sampling/sampling_logp_difference/max": 0.7467775344848633, + "sampling/sampling_logp_difference/mean": 0.010932646691799164, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 206.65625, + "completions/mean_terminated_length": 206.65625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.26046937704086304, + "epoch": 2.9705882352941178, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.0295393323799478, + "kl": 0.1752389371395111, + "learning_rate": 3.978148869103748e-10, + "loss": -0.0007, + "num_tokens": 76176808.0, + "reward": 0.71875, + "reward_std": 0.5457825064659119, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7038747072219849, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, + "sampling/importance_sampling_ratio/min": 0.5678154826164246, + "sampling/sampling_logp_difference/max": 0.5659587383270264, + "sampling/sampling_logp_difference/mean": 0.014540375210344791, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 189.234375, + "completions/mean_terminated_length": 189.234375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2247229814529419, + "epoch": 2.971813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03381290248002876, + "kl": 0.056775644421577454, + "learning_rate": 3.699103964661665e-10, + "loss": 0.0005, + "num_tokens": 76220871.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8061577081680298, + "sampling/importance_sampling_ratio/mean": 0.9999212622642517, + "sampling/importance_sampling_ratio/min": 0.5134853720664978, + "sampling/sampling_logp_difference/max": 0.6665337085723877, + "sampling/sampling_logp_difference/mean": 0.016165809705853462, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 159.546875, + "completions/mean_terminated_length": 159.546875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.24071826040744781, + "epoch": 2.9730392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4308604112635903, + "kl": 0.10873550176620483, + "learning_rate": 3.430201234513874e-10, + "loss": 0.0058, + "num_tokens": 76244666.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998641014099121, + "sampling/importance_sampling_ratio/min": 0.5392753481864929, + "sampling/sampling_logp_difference/max": 0.945838212966919, + "sampling/sampling_logp_difference/mean": 0.015373199246823788, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 187.1875, + "completions/mean_terminated_length": 187.1875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2494470477104187, + "epoch": 2.974264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045471425782333735, + "kl": 0.08607512712478638, + "learning_rate": 3.171441224514848e-10, + "loss": 0.0008, + "num_tokens": 76275046.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5827796459197998, + "sampling/importance_sampling_ratio/mean": 0.9993318915367126, + "sampling/importance_sampling_ratio/min": 0.5341923236846924, + "sampling/sampling_logp_difference/max": 0.6269993782043457, + "sampling/sampling_logp_difference/mean": 0.014485558494925499, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 179.765625, + "completions/mean_terminated_length": 179.765625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.2240767478942871, + "epoch": 2.9754901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5617583684394614, + "kl": 0.10678014159202576, + "learning_rate": 2.922824459931639e-10, + "loss": 0.057, + "num_tokens": 76307127.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999181032180786, + "sampling/importance_sampling_ratio/min": 0.5355904698371887, + "sampling/sampling_logp_difference/max": 0.9863035678863525, + "sampling/sampling_logp_difference/mean": 0.013244271278381348, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 198.609375, + "completions/mean_terminated_length": 198.609375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.29143062233924866, + "epoch": 2.9767156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.470008467109167, + "kl": 0.08310844004154205, + "learning_rate": 2.684351445440547e-10, + "loss": -0.043, + "num_tokens": 76342798.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6291046142578125, + "sampling/importance_sampling_ratio/mean": 1.000214695930481, + "sampling/importance_sampling_ratio/min": 0.5359659194946289, + "sampling/sampling_logp_difference/max": 0.6236846446990967, + "sampling/sampling_logp_difference/mean": 0.017195921391248703, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 190.140625, + "completions/mean_terminated_length": 190.140625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.29836857318878174, + "epoch": 2.9779411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3373957085876917, + "kl": 0.13529150187969208, + "learning_rate": 2.456022665127122e-10, + "loss": -0.0168, + "num_tokens": 76379095.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000542402267456, + "sampling/importance_sampling_ratio/min": 0.5902017951011658, + "sampling/sampling_logp_difference/max": 0.7390432357788086, + "sampling/sampling_logp_difference/mean": 0.015541539527475834, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 150.703125, + "completions/mean_terminated_length": 150.703125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2485143095254898, + "epoch": 2.9791666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6228619743834114, + "kl": 0.1274341642856598, + "learning_rate": 2.2378385824833866e-10, + "loss": -0.0128, + "num_tokens": 76409188.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.73202645778656, + "sampling/importance_sampling_ratio/mean": 1.000281572341919, + "sampling/importance_sampling_ratio/min": 0.48663073778152466, + "sampling/sampling_logp_difference/max": 0.7202496528625488, + "sampling/sampling_logp_difference/mean": 0.015785079449415207, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 212.75, + "completions/mean_terminated_length": 212.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.26056230068206787, + "epoch": 2.980392156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8957602438814687, + "kl": 0.0845237746834755, + "learning_rate": 2.0297996404095018e-10, + "loss": -0.113, + "num_tokens": 76439876.0, + "reward": 0.3125, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.4784256219863892, + "sampling/importance_sampling_ratio/mean": 0.999798059463501, + "sampling/importance_sampling_ratio/min": 0.3641431927680969, + "sampling/sampling_logp_difference/max": 1.0102081298828125, + "sampling/sampling_logp_difference/mean": 0.014546210877597332, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 173.078125, + "completions/mean_terminated_length": 173.078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.24924761056900024, + "epoch": 2.9816176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4017548869277197, + "kl": 0.08277080953121185, + "learning_rate": 1.8319062612115467e-10, + "loss": -0.0112, + "num_tokens": 76468889.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6413885354995728, + "sampling/importance_sampling_ratio/mean": 1.0004193782806396, + "sampling/importance_sampling_ratio/min": 0.5419471859931946, + "sampling/sampling_logp_difference/max": 0.6125867366790771, + "sampling/sampling_logp_difference/mean": 0.015157599002122879, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 179.828125, + "completions/mean_terminated_length": 179.828125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.2005600929260254, + "epoch": 2.982843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2288829197558995, + "kl": 0.056043677031993866, + "learning_rate": 1.6441588466009627e-10, + "loss": 0.0059, + "num_tokens": 76496974.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5744365453720093, + "sampling/importance_sampling_ratio/mean": 0.9994364976882935, + "sampling/importance_sampling_ratio/min": 0.6059094667434692, + "sampling/sampling_logp_difference/max": 0.5010247230529785, + "sampling/sampling_logp_difference/mean": 0.011820659041404724, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 169.203125, + "completions/mean_terminated_length": 169.203125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.242442786693573, + "epoch": 2.9840686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4555295883616384, + "kl": 0.08681453764438629, + "learning_rate": 1.4665577776923343e-10, + "loss": 0.0132, + "num_tokens": 76527131.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5895564556121826, + "sampling/importance_sampling_ratio/mean": 0.9990965127944946, + "sampling/importance_sampling_ratio/min": 0.35135895013809204, + "sampling/sampling_logp_difference/max": 1.0459469556808472, + "sampling/sampling_logp_difference/mean": 0.01459289900958538, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 175.890625, + "completions/mean_terminated_length": 175.890625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2646161615848541, + "epoch": 2.985294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.446731364139672, + "kl": 0.09826575964689255, + "learning_rate": 1.2991034150050538e-10, + "loss": -0.0477, + "num_tokens": 76554276.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.8623839616775513, + "sampling/importance_sampling_ratio/mean": 1.0008070468902588, + "sampling/importance_sampling_ratio/min": 0.5893176198005676, + "sampling/sampling_logp_difference/max": 0.6218574047088623, + "sampling/sampling_logp_difference/mean": 0.013790035620331764, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 166.890625, + "completions/mean_terminated_length": 166.890625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.21010886132717133, + "epoch": 2.986519607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.1361755153214474, + "kl": 0.09595704823732376, + "learning_rate": 1.1417960984605457e-10, + "loss": 0.0223, + "num_tokens": 76580621.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992457032203674, + "sampling/importance_sampling_ratio/min": 0.5073873996734619, + "sampling/sampling_logp_difference/max": 0.7147841453552246, + "sampling/sampling_logp_difference/mean": 0.012746874243021011, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 229.0625, + "completions/mean_terminated_length": 229.0625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.23335005342960358, + "epoch": 2.9877450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6997078272872883, + "kl": 0.06975404918193817, + "learning_rate": 9.946361473822662e-11, + "loss": 0.0268, + "num_tokens": 76614641.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9622851610183716, + "sampling/importance_sampling_ratio/mean": 1.0005013942718506, + "sampling/importance_sampling_ratio/min": 0.06372610479593277, + "sampling/sampling_logp_difference/max": 2.7531609535217285, + "sampling/sampling_logp_difference/mean": 0.013777503743767738, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 208.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2805878520011902, + "epoch": 2.9889705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3875109591325976, + "kl": 0.08423937112092972, + "learning_rate": 8.576238604968144e-11, + "loss": 0.0179, + "num_tokens": 76648129.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998219013214111, + "sampling/importance_sampling_ratio/min": 0.5426336526870728, + "sampling/sampling_logp_difference/max": 0.7030305862426758, + "sampling/sampling_logp_difference/mean": 0.015688970685005188, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 151.140625, + "completions/mean_terminated_length": 151.140625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2604881227016449, + "epoch": 2.9901960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2521160841775165, + "kl": 0.11699055880308151, + "learning_rate": 7.307595159300461e-11, + "loss": -0.0004, + "num_tokens": 76677706.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.544237494468689, + "sampling/importance_sampling_ratio/mean": 0.9995912313461304, + "sampling/importance_sampling_ratio/min": 0.6103914976119995, + "sampling/sampling_logp_difference/max": 0.493654727935791, + "sampling/sampling_logp_difference/mean": 0.01679575815796852, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 223.625, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3009791970252991, + "epoch": 2.991421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7276816385245755, + "kl": 0.1001376211643219, + "learning_rate": 6.140433712076287e-11, + "loss": -0.0574, + "num_tokens": 76712930.0, + "reward": 0.15625, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998260140419006, + "sampling/importance_sampling_ratio/min": 0.3920634984970093, + "sampling/sampling_logp_difference/max": 0.9363315105438232, + "sampling/sampling_logp_difference/mean": 0.01611517369747162, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 196.25, + "completions/mean_terminated_length": 196.25, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2741738557815552, + "epoch": 2.9926470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4004598692258408, + "kl": 0.09876975417137146, + "learning_rate": 5.074756632572619e-11, + "loss": 0.0142, + "num_tokens": 76742546.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.871956467628479, + "sampling/importance_sampling_ratio/mean": 1.0001269578933716, + "sampling/importance_sampling_ratio/min": 0.5572464466094971, + "sampling/sampling_logp_difference/max": 0.6269841194152832, + "sampling/sampling_logp_difference/mean": 0.01623227261006832, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 190.6875, + "completions/mean_terminated_length": 190.6875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2576693892478943, + "epoch": 2.993872549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.079763238699461, + "kl": 0.07150693237781525, + "learning_rate": 4.110566084036815e-11, + "loss": 0.0212, + "num_tokens": 76771534.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4220448732376099, + "sampling/importance_sampling_ratio/mean": 0.9988895058631897, + "sampling/importance_sampling_ratio/min": 0.6046943068504333, + "sampling/sampling_logp_difference/max": 0.5030322074890137, + "sampling/sampling_logp_difference/mean": 0.014783736318349838, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 211.5625, + "completions/mean_terminated_length": 211.5625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.30968382954597473, + "epoch": 2.9950980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.637648467447538, + "kl": 0.10707981139421463, + "learning_rate": 3.247864023719904e-11, + "loss": -0.0073, + "num_tokens": 76800914.0, + "reward": 0.5, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000529289245605, + "sampling/importance_sampling_ratio/min": 0.4752058982849121, + "sampling/sampling_logp_difference/max": 1.4294970035552979, + "sampling/sampling_logp_difference/mean": 0.015981096774339676, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 150.59375, + "completions/mean_terminated_length": 150.59375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.19654402136802673, + "epoch": 2.9963235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3421147106329778, + "kl": 0.10663546621799469, + "learning_rate": 2.4866522028488268e-11, + "loss": 0.0004, + "num_tokens": 76827384.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6062535047531128, + "sampling/importance_sampling_ratio/mean": 0.9999834895133972, + "sampling/importance_sampling_ratio/min": 0.47787678241729736, + "sampling/sampling_logp_difference/max": 0.7384023666381836, + "sampling/sampling_logp_difference/mean": 0.012441320344805717, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 186.046875, + "completions/mean_terminated_length": 186.046875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.26118266582489014, + "epoch": 2.997549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3243641853561114, + "kl": 0.11284665763378143, + "learning_rate": 1.8269321666375403e-11, + "loss": -0.0286, + "num_tokens": 76856539.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6207815408706665, + "sampling/importance_sampling_ratio/mean": 0.9997130036354065, + "sampling/importance_sampling_ratio/min": 0.3909359574317932, + "sampling/sampling_logp_difference/max": 0.9392115473747253, + "sampling/sampling_logp_difference/mean": 0.016125911846756935, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 214.484375, + "completions/mean_terminated_length": 214.484375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.24739396572113037, + "epoch": 2.998774509803922, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6065085595715245, + "kl": 0.09260021150112152, + "learning_rate": 1.2687052542759147e-11, + "loss": 0.0473, + "num_tokens": 76889114.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6634241342544556, + "sampling/importance_sampling_ratio/mean": 1.0005452632904053, + "sampling/importance_sampling_ratio/min": 0.5677346587181091, + "sampling/sampling_logp_difference/max": 0.56610107421875, + "sampling/sampling_logp_difference/mean": 0.014586443081498146, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 195.234375, + "completions/mean_terminated_length": 195.234375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.2355600744485855, + "epoch": 3.0, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2193175282367605, + "kl": 0.07594628632068634, + "learning_rate": 8.119725989241822e-12, + "loss": 0.0215, + "num_tokens": 76916233.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5596669912338257, + "sampling/importance_sampling_ratio/mean": 0.9998251795768738, + "sampling/importance_sampling_ratio/min": 0.5702371001243591, + "sampling/sampling_logp_difference/max": 0.561703085899353, + "sampling/sampling_logp_difference/mean": 0.013978696428239346, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 215.984375, + "completions/mean_terminated_length": 215.984375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2412678301334381, + "epoch": 3.0012254901960786, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.882924046267253, + "kl": 0.12087821215391159, + "learning_rate": 4.5673512772959055e-12, + "loss": 0.051, + "num_tokens": 76951288.0, + "reward": 0.53125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992916584014893, + "sampling/importance_sampling_ratio/min": 0.32853004336357117, + "sampling/sampling_logp_difference/max": 1.1131269931793213, + "sampling/sampling_logp_difference/mean": 0.014975108206272125, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 152.46875, + "completions/mean_terminated_length": 152.46875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.2571891248226166, + "epoch": 3.002450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7295956536656483, + "kl": 0.14601531624794006, + "learning_rate": 2.0299356179309666e-12, + "loss": 0.0302, + "num_tokens": 76984662.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5563182830810547, + "sampling/importance_sampling_ratio/mean": 0.9998385906219482, + "sampling/importance_sampling_ratio/min": 0.6069795489311218, + "sampling/sampling_logp_difference/max": 0.4992602467536926, + "sampling/sampling_logp_difference/mean": 0.015918415039777756, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 163.515625, + "completions/mean_terminated_length": 163.515625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.21576866507530212, + "epoch": 3.0036764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07391588468035754, + "kl": 0.06929220259189606, + "learning_rate": 5.074841620267278e-13, + "loss": 0.0007, + "num_tokens": 77010407.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5475221872329712, + "sampling/importance_sampling_ratio/mean": 0.9999107122421265, + "sampling/importance_sampling_ratio/min": 0.22164645791053772, + "sampling/sampling_logp_difference/max": 1.506671667098999, + "sampling/sampling_logp_difference/mean": 0.01383669301867485, + "step": 2451 + } + ], + "logging_steps": 1, + "max_steps": 2451, + "num_input_tokens_seen": 77010407, + "num_train_epochs": 4, + "save_steps": 817, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}