{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0036764705882355, "eval_steps": 500, "global_step": 2451, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 267.484375, "completions/mean_terminated_length": 267.484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.145115926861763, "epoch": 0.0012254901960784314, "frac_reward_zero_std": 0.5, "grad_norm": 1.2699338046500006, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0224, "num_tokens": 32911.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.8768293857574463, "sampling/importance_sampling_ratio/mean": 1.0000336170196533, "sampling/importance_sampling_ratio/min": 0.4881851375102997, "sampling/sampling_logp_difference/max": 0.7170605659484863, "sampling/sampling_logp_difference/mean": 0.011007876135408878, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 184.265625, "completions/mean_terminated_length": 184.265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.17698192596435547, "epoch": 0.0024509803921568627, "frac_reward_zero_std": 0.0, "grad_norm": 3.279908744405416, "kl": 0.0, "learning_rate": 4.065040650406504e-09, "loss": -0.0334, "num_tokens": 60896.0, "reward": -0.0625, "reward_std": 0.644389271736145, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992719888687134, "sampling/importance_sampling_ratio/min": 0.14117345213890076, "sampling/sampling_logp_difference/max": 1.9577659368515015, "sampling/sampling_logp_difference/mean": 0.015893325209617615, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 210.71875, "completions/mean_terminated_length": 210.71875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.21332260966300964, "epoch": 0.003676470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 3.5294807730188364, "kl": 0.0008067585877142847, "learning_rate": 8.130081300813008e-09, "loss": -0.0791, "num_tokens": 92574.0, "reward": 0.3125, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.7787456512451172, "sampling/importance_sampling_ratio/mean": 0.9998263120651245, "sampling/importance_sampling_ratio/min": 0.28856924176216125, "sampling/sampling_logp_difference/max": 1.2428202629089355, "sampling/sampling_logp_difference/mean": 0.016139939427375793, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 264.4375, "completions/mean_terminated_length": 264.4375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1841556429862976, "epoch": 0.004901960784313725, "frac_reward_zero_std": 0.25, "grad_norm": 1.9956682594403414, "kl": 0.0007476672763004899, "learning_rate": 1.2195121951219512e-08, "loss": -0.1092, "num_tokens": 128282.0, "reward": 0.375, "reward_std": 0.7236068248748779, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002386569976807, "sampling/importance_sampling_ratio/min": 0.3053293526172638, "sampling/sampling_logp_difference/max": 1.1863641738891602, "sampling/sampling_logp_difference/mean": 0.0141455614939332, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 152.421875, "completions/mean_terminated_length": 152.421875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2214486002922058, "epoch": 0.006127450980392157, "frac_reward_zero_std": 0.25, "grad_norm": 4.369864432046693, "kl": 0.001004523248411715, "learning_rate": 1.6260162601626016e-08, "loss": 0.0366, "num_tokens": 163285.0, "reward": 0.25, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9981650114059448, "sampling/importance_sampling_ratio/min": 0.3985414505004883, "sampling/sampling_logp_difference/max": 0.9357107877731323, "sampling/sampling_logp_difference/mean": 0.020896129310131073, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 166.28125, "completions/mean_terminated_length": 166.28125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23074975609779358, "epoch": 0.007352941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 3.3140508936905912, "kl": 0.0009867888875305653, "learning_rate": 2.032520325203252e-08, "loss": 0.0136, "num_tokens": 189351.0, "reward": -0.34375, "reward_std": 0.8705305457115173, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.956235408782959, "sampling/importance_sampling_ratio/mean": 0.9999337196350098, "sampling/importance_sampling_ratio/min": 0.39710769057273865, "sampling/sampling_logp_difference/max": 0.9235477447509766, "sampling/sampling_logp_difference/mean": 0.020348988473415375, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.1819380521774292, "epoch": 0.00857843137254902, "frac_reward_zero_std": 0.25, "grad_norm": 2.4418243651013998, "kl": 0.0008182962192222476, "learning_rate": 2.4390243902439023e-08, "loss": -0.0953, "num_tokens": 220759.0, "reward": 0.40625, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.871976613998413, "sampling/importance_sampling_ratio/mean": 0.9996786713600159, "sampling/importance_sampling_ratio/min": 0.31916162371635437, "sampling/sampling_logp_difference/max": 1.1420576572418213, "sampling/sampling_logp_difference/mean": 0.014583440497517586, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 192.546875, "completions/mean_terminated_length": 192.546875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1745167374610901, "epoch": 0.00980392156862745, "frac_reward_zero_std": 0.25, "grad_norm": 2.5788609427873816, "kl": 0.0007142078247852623, "learning_rate": 2.8455284552845527e-08, "loss": -0.0171, "num_tokens": 250730.0, "reward": 0.34375, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.574067234992981, "sampling/importance_sampling_ratio/mean": 1.0004377365112305, "sampling/importance_sampling_ratio/min": 0.5096371173858643, "sampling/sampling_logp_difference/max": 0.6740564107894897, "sampling/sampling_logp_difference/mean": 0.012199976481497288, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 187.140625, "completions/mean_terminated_length": 187.140625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1829577535390854, "epoch": 0.011029411764705883, "frac_reward_zero_std": 0.25, "grad_norm": 2.8171989512948765, "kl": 0.0006950580282136798, "learning_rate": 3.252032520325203e-08, "loss": -0.164, "num_tokens": 284835.0, "reward": 0.21875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001541376113892, "sampling/importance_sampling_ratio/min": 0.25570473074913025, "sampling/sampling_logp_difference/max": 1.363731861114502, "sampling/sampling_logp_difference/mean": 0.014619017019867897, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 199.390625, "completions/mean_terminated_length": 199.390625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.21946877241134644, "epoch": 0.012254901960784314, "frac_reward_zero_std": 0.25, "grad_norm": 2.743495906786532, "kl": 0.0010361624881625175, "learning_rate": 3.658536585365853e-08, "loss": -0.0196, "num_tokens": 318204.0, "reward": 0.25, "reward_std": 0.6613117456436157, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999617338180542, "sampling/importance_sampling_ratio/min": 0.40798985958099365, "sampling/sampling_logp_difference/max": 0.8965129852294922, "sampling/sampling_logp_difference/mean": 0.017877453938126564, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 172.296875, "completions/mean_terminated_length": 172.296875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.21682016551494598, "epoch": 0.013480392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 2.7792153496967664, "kl": 0.0012470419751480222, "learning_rate": 4.065040650406504e-08, "loss": -0.0067, "num_tokens": 345983.0, "reward": 0.09375, "reward_std": 0.686570405960083, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997990131378174, "sampling/importance_sampling_ratio/min": 0.15214312076568604, "sampling/sampling_logp_difference/max": 1.8829336166381836, "sampling/sampling_logp_difference/mean": 0.018758177757263184, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 186.734375, "completions/mean_terminated_length": 186.734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.18953701853752136, "epoch": 0.014705882352941176, "frac_reward_zero_std": 0.25, "grad_norm": 3.035123873225246, "kl": 0.0011587527114897966, "learning_rate": 4.4715447154471546e-08, "loss": 0.0796, "num_tokens": 374830.0, "reward": 0.59375, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007586479187012, "sampling/importance_sampling_ratio/min": 0.3381035625934601, "sampling/sampling_logp_difference/max": 1.0844030380249023, "sampling/sampling_logp_difference/mean": 0.014523649588227272, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 172.078125, "completions/mean_terminated_length": 172.078125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1656114161014557, "epoch": 0.015931372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 2.417069846130118, "kl": 0.000998029951006174, "learning_rate": 4.878048780487805e-08, "loss": 0.0026, "num_tokens": 400211.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995077848434448, "sampling/importance_sampling_ratio/min": 0.2681380808353424, "sampling/sampling_logp_difference/max": 1.3162531852722168, "sampling/sampling_logp_difference/mean": 0.01516179833561182, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.16202491521835327, "epoch": 0.01715686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 2.213437710938307, "kl": 0.0008014775812625885, "learning_rate": 5.2845528455284554e-08, "loss": -0.0109, "num_tokens": 426503.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.8623807430267334, "sampling/importance_sampling_ratio/mean": 0.9993404746055603, "sampling/importance_sampling_ratio/min": 0.0011495015351101756, "sampling/sampling_logp_difference/max": 6.768426895141602, "sampling/sampling_logp_difference/mean": 0.01480356976389885, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 147.390625, "completions/mean_terminated_length": 147.390625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.12848389148712158, "epoch": 0.01838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.008026646180956696, "kl": 0.000677362666465342, "learning_rate": 5.6910569105691055e-08, "loss": 0.0, "num_tokens": 453488.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6391452550888062, "sampling/importance_sampling_ratio/mean": 0.9992384910583496, "sampling/importance_sampling_ratio/min": 0.31635162234306335, "sampling/sampling_logp_difference/max": 1.150900959968567, "sampling/sampling_logp_difference/mean": 0.011211428791284561, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 249.453125, "completions/mean_terminated_length": 249.453125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.23030908405780792, "epoch": 0.0196078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.4556318068591032, "kl": 0.0008972191135399044, "learning_rate": 6.097560975609756e-08, "loss": 0.0407, "num_tokens": 495101.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994782209396362, "sampling/importance_sampling_ratio/min": 0.3743397891521454, "sampling/sampling_logp_difference/max": 0.9825913906097412, "sampling/sampling_logp_difference/mean": 0.017919588834047318, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 155.734375, "completions/mean_terminated_length": 155.734375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.18947651982307434, "epoch": 0.020833333333333332, "frac_reward_zero_std": 0.5, "grad_norm": 2.904796813624881, "kl": 0.0007599001983180642, "learning_rate": 6.504065040650406e-08, "loss": 0.0041, "num_tokens": 521468.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.9906654357910156, "sampling/importance_sampling_ratio/mean": 0.9998794794082642, "sampling/importance_sampling_ratio/min": 0.4074186682701111, "sampling/sampling_logp_difference/max": 0.897913932800293, "sampling/sampling_logp_difference/mean": 0.015055290423333645, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 188.46875, "completions/mean_terminated_length": 188.46875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.18024782836437225, "epoch": 0.022058823529411766, "frac_reward_zero_std": 0.25, "grad_norm": 2.5232328359620357, "kl": 0.0013627070002257824, "learning_rate": 6.910569105691057e-08, "loss": -0.0298, "num_tokens": 548234.0, "reward": 0.25, "reward_std": 0.6494960784912109, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007586479187012, "sampling/importance_sampling_ratio/min": 0.09287115931510925, "sampling/sampling_logp_difference/max": 2.376542091369629, "sampling/sampling_logp_difference/mean": 0.016794255003333092, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 220.34375, "completions/mean_terminated_length": 220.34375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.20761245489120483, "epoch": 0.023284313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 2.2247091470788702, "kl": 0.0009089668747037649, "learning_rate": 7.317073170731706e-08, "loss": -0.0216, "num_tokens": 582880.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8837299346923828, "sampling/importance_sampling_ratio/mean": 0.9993168115615845, "sampling/importance_sampling_ratio/min": 0.28969964385032654, "sampling/sampling_logp_difference/max": 1.2389106750488281, "sampling/sampling_logp_difference/mean": 0.016906775534152985, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 167.796875, "completions/mean_terminated_length": 167.796875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1678135246038437, "epoch": 0.024509803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 2.2937421687772095, "kl": 0.0010125580010935664, "learning_rate": 7.723577235772358e-08, "loss": -0.0078, "num_tokens": 613267.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992740750312805, "sampling/importance_sampling_ratio/min": 0.22527435421943665, "sampling/sampling_logp_difference/max": 1.490436315536499, "sampling/sampling_logp_difference/mean": 0.01392899826169014, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.19384238123893738, "epoch": 0.025735294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.324882947640868, "kl": 0.0007086207042448223, "learning_rate": 8.130081300813008e-08, "loss": 0.007, "num_tokens": 642203.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000007152557373, "sampling/importance_sampling_ratio/min": 0.5038077235221863, "sampling/sampling_logp_difference/max": 0.8750619888305664, "sampling/sampling_logp_difference/mean": 0.013608230277895927, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 175.171875, "completions/mean_terminated_length": 175.171875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.17411136627197266, "epoch": 0.02696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.4113258242602515, "kl": 0.0009255572222173214, "learning_rate": 8.536585365853659e-08, "loss": 0.0121, "num_tokens": 673318.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998393058776855, "sampling/importance_sampling_ratio/min": 0.5363104939460754, "sampling/sampling_logp_difference/max": 0.7736172676086426, "sampling/sampling_logp_difference/mean": 0.013956751674413681, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 211.4375, "completions/mean_terminated_length": 211.4375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2113991677761078, "epoch": 0.028186274509803922, "frac_reward_zero_std": 0.25, "grad_norm": 2.2963084099370983, "kl": 0.001079519628547132, "learning_rate": 8.943089430894309e-08, "loss": -0.0252, "num_tokens": 706850.0, "reward": 0.28125, "reward_std": 0.7561737298965454, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994499683380127, "sampling/importance_sampling_ratio/min": 0.3145255744457245, "sampling/sampling_logp_difference/max": 1.1566898822784424, "sampling/sampling_logp_difference/mean": 0.016110863536596298, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.18932907283306122, "epoch": 0.029411764705882353, "frac_reward_zero_std": 0.25, "grad_norm": 2.4792697662005376, "kl": 0.0008806021651253104, "learning_rate": 9.349593495934959e-08, "loss": 0.0367, "num_tokens": 734266.0, "reward": 0.6875, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997655153274536, "sampling/importance_sampling_ratio/min": 0.37791651487350464, "sampling/sampling_logp_difference/max": 0.9730819463729858, "sampling/sampling_logp_difference/mean": 0.015410843305289745, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 218.546875, "completions/mean_terminated_length": 218.546875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.256826788187027, "epoch": 0.030637254901960783, "frac_reward_zero_std": 0.5, "grad_norm": 2.078362603626218, "kl": 0.0011094619985669851, "learning_rate": 9.75609756097561e-08, "loss": 0.0062, "num_tokens": 768893.0, "reward": -0.625, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": -0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6925945281982422, "sampling/importance_sampling_ratio/mean": 0.9995798468589783, "sampling/importance_sampling_ratio/min": 0.3771505653858185, "sampling/sampling_logp_difference/max": 0.9751107692718506, "sampling/sampling_logp_difference/mean": 0.018611162900924683, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 201.328125, "completions/mean_terminated_length": 201.328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1686311811208725, "epoch": 0.031862745098039214, "frac_reward_zero_std": 0.75, "grad_norm": 1.7713450102289707, "kl": 0.0007161884568631649, "learning_rate": 1.016260162601626e-07, "loss": -0.0072, "num_tokens": 798258.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5744632482528687, "sampling/importance_sampling_ratio/mean": 1.0003325939178467, "sampling/importance_sampling_ratio/min": 0.5094258785247803, "sampling/sampling_logp_difference/max": 0.6744709014892578, "sampling/sampling_logp_difference/mean": 0.012171566486358643, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 159.234375, "completions/mean_terminated_length": 159.234375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.18077212572097778, "epoch": 0.03308823529411765, "frac_reward_zero_std": 0.5, "grad_norm": 2.331492715904185, "kl": 0.001058193389326334, "learning_rate": 1.0569105691056911e-07, "loss": -0.0145, "num_tokens": 830337.0, "reward": 0.71875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7081413269042969, "sampling/importance_sampling_ratio/mean": 0.9999635219573975, "sampling/importance_sampling_ratio/min": 0.6173048615455627, "sampling/sampling_logp_difference/max": 0.5354058742523193, "sampling/sampling_logp_difference/mean": 0.015099374577403069, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 176.4375, "completions/mean_terminated_length": 176.4375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.23961573839187622, "epoch": 0.03431372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 2.416952272772911, "kl": 0.001167132519185543, "learning_rate": 1.097560975609756e-07, "loss": 0.0155, "num_tokens": 858221.0, "reward": 0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.9112063646316528, "sampling/importance_sampling_ratio/mean": 0.9995735883712769, "sampling/importance_sampling_ratio/min": 0.4739725589752197, "sampling/sampling_logp_difference/max": 0.7466058731079102, "sampling/sampling_logp_difference/mean": 0.01782134920358658, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 162.109375, "completions/mean_terminated_length": 162.109375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2027626484632492, "epoch": 0.03553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 2.1486452460862857, "kl": 0.0009120207978412509, "learning_rate": 1.1382113821138211e-07, "loss": 0.0393, "num_tokens": 886804.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.608196496963501, "sampling/importance_sampling_ratio/mean": 0.9993435144424438, "sampling/importance_sampling_ratio/min": 0.4156869351863861, "sampling/sampling_logp_difference/max": 0.8778228759765625, "sampling/sampling_logp_difference/mean": 0.016218479722738266, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 175.296875, "completions/mean_terminated_length": 175.296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.20189282298088074, "epoch": 0.03676470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 3.42404206856289, "kl": 0.0011729395482689142, "learning_rate": 1.1788617886178862e-07, "loss": -0.0321, "num_tokens": 911463.0, "reward": 0.09375, "reward_std": 0.8273203372955322, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996466636657715, "sampling/importance_sampling_ratio/min": 0.24144580960273743, "sampling/sampling_logp_difference/max": 1.4211102724075317, "sampling/sampling_logp_difference/mean": 0.01663786731660366, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 242.421875, "completions/mean_terminated_length": 242.421875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.1681455820798874, "epoch": 0.03799019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.7403563125874317, "kl": 0.009450777433812618, "learning_rate": 1.219512195121951e-07, "loss": 0.0433, "num_tokens": 941970.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996355772018433, "sampling/importance_sampling_ratio/min": 0.008333723060786724, "sampling/sampling_logp_difference/max": 4.787445068359375, "sampling/sampling_logp_difference/mean": 0.015643514692783356, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 160.578125, "completions/mean_terminated_length": 160.578125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.14362278580665588, "epoch": 0.0392156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.5064949900405952, "kl": 0.0027769720181822777, "learning_rate": 1.260162601626016e-07, "loss": 0.0, "num_tokens": 977127.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999289512634277, "sampling/importance_sampling_ratio/min": 0.049872394651174545, "sampling/sampling_logp_difference/max": 2.9982876777648926, "sampling/sampling_logp_difference/mean": 0.013318167068064213, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 237.484375, "completions/mean_terminated_length": 237.484375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.17521731555461884, "epoch": 0.04044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.008297787487699215, "kl": 0.0007624527206644416, "learning_rate": 1.3008130081300813e-07, "loss": 0.0, "num_tokens": 1011142.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6099501848220825, "sampling/importance_sampling_ratio/mean": 0.9996030926704407, "sampling/importance_sampling_ratio/min": 0.3408905267715454, "sampling/sampling_logp_difference/max": 1.076193928718567, "sampling/sampling_logp_difference/mean": 0.012307427823543549, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 222.578125, "completions/mean_terminated_length": 222.578125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.20559880137443542, "epoch": 0.041666666666666664, "frac_reward_zero_std": 0.25, "grad_norm": 2.013657529944585, "kl": 0.000846860115416348, "learning_rate": 1.3414634146341465e-07, "loss": 0.0086, "num_tokens": 1045355.0, "reward": 0.0, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8357250690460205, "sampling/importance_sampling_ratio/mean": 0.9997804164886475, "sampling/importance_sampling_ratio/min": 0.2929272949695587, "sampling/sampling_logp_difference/max": 1.2278308868408203, "sampling/sampling_logp_difference/mean": 0.014453263953328133, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 169.34375, "completions/mean_terminated_length": 169.34375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.19829398393630981, "epoch": 0.0428921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 2.505141941859476, "kl": 0.0011524150613695383, "learning_rate": 1.3821138211382114e-07, "loss": 0.0141, "num_tokens": 1075969.0, "reward": -0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999667763710022, "sampling/importance_sampling_ratio/min": 0.22803999483585358, "sampling/sampling_logp_difference/max": 1.4782342910766602, "sampling/sampling_logp_difference/mean": 0.016655966639518738, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 205.53125, "completions/mean_terminated_length": 205.53125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.19215545058250427, "epoch": 0.04411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.4767662046376377, "kl": 0.0009019374847412109, "learning_rate": 1.4227642276422763e-07, "loss": 0.0085, "num_tokens": 1109795.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8890724182128906, "sampling/importance_sampling_ratio/mean": 1.0002727508544922, "sampling/importance_sampling_ratio/min": 0.37968289852142334, "sampling/sampling_logp_difference/max": 0.9684188365936279, "sampling/sampling_logp_difference/mean": 0.01590348407626152, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.270089328289032, "epoch": 0.04534313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.3259173987424338, "kl": 0.0008042281260713935, "learning_rate": 1.4634146341463413e-07, "loss": -0.0085, "num_tokens": 1143731.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002659559249878, "sampling/importance_sampling_ratio/min": 0.5362957715988159, "sampling/sampling_logp_difference/max": 0.9607458114624023, "sampling/sampling_logp_difference/mean": 0.017081955447793007, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 165.03125, "completions/mean_terminated_length": 165.03125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.20545266568660736, "epoch": 0.04656862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.5826552693603972, "kl": 0.0007720981957390904, "learning_rate": 1.5040650406504065e-07, "loss": -0.0142, "num_tokens": 1175077.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994447231292725, "sampling/importance_sampling_ratio/min": 0.459695965051651, "sampling/sampling_logp_difference/max": 0.7771899700164795, "sampling/sampling_logp_difference/mean": 0.015556670725345612, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 229.265625, "completions/mean_terminated_length": 229.265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2712585926055908, "epoch": 0.04779411764705882, "frac_reward_zero_std": 0.25, "grad_norm": 2.353506040765074, "kl": 0.0009125759825110435, "learning_rate": 1.5447154471544717e-07, "loss": -0.0731, "num_tokens": 1204102.0, "reward": 0.0625, "reward_std": 0.6663130521774292, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7431355714797974, "sampling/importance_sampling_ratio/mean": 1.000112771987915, "sampling/importance_sampling_ratio/min": 0.3292010724544525, "sampling/sampling_logp_difference/max": 1.1110866069793701, "sampling/sampling_logp_difference/mean": 0.019008290022611618, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 162.796875, "completions/mean_terminated_length": 162.796875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.20898710191249847, "epoch": 0.049019607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 2.6237440741233096, "kl": 0.0029189821798354387, "learning_rate": 1.5853658536585366e-07, "loss": -0.0096, "num_tokens": 1232585.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992484450340271, "sampling/importance_sampling_ratio/min": 0.04195699468255043, "sampling/sampling_logp_difference/max": 3.171110153198242, "sampling/sampling_logp_difference/mean": 0.0167709868401289, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 205.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2982996702194214, "epoch": 0.05024509803921569, "frac_reward_zero_std": 0.25, "grad_norm": 2.799568407655128, "kl": 0.0009631913271732628, "learning_rate": 1.6260162601626016e-07, "loss": 0.0097, "num_tokens": 1261949.0, "reward": -0.21875, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994151592254639, "sampling/importance_sampling_ratio/min": 0.5006861686706543, "sampling/sampling_logp_difference/max": 0.8760700225830078, "sampling/sampling_logp_difference/mean": 0.01940348744392395, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 183.546875, "completions/mean_terminated_length": 183.546875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.15424934029579163, "epoch": 0.051470588235294115, "frac_reward_zero_std": 0.5, "grad_norm": 1.7897617291631476, "kl": 0.0008602460147812963, "learning_rate": 1.6666666666666665e-07, "loss": -0.0108, "num_tokens": 1291360.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0012483596801758, "sampling/importance_sampling_ratio/min": 0.3941444158554077, "sampling/sampling_logp_difference/max": 0.9310379028320312, "sampling/sampling_logp_difference/mean": 0.013946986757218838, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 150.640625, "completions/mean_terminated_length": 150.640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.14799076318740845, "epoch": 0.05269607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 3.3545311634479824, "kl": 0.001798890414647758, "learning_rate": 1.7073170731707317e-07, "loss": 0.0144, "num_tokens": 1315833.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005621910095215, "sampling/importance_sampling_ratio/min": 0.0856546014547348, "sampling/sampling_logp_difference/max": 2.457432270050049, "sampling/sampling_logp_difference/mean": 0.01602208986878395, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 191.59375, "completions/mean_terminated_length": 191.59375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.21230532228946686, "epoch": 0.05392156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 2.311298023398624, "kl": 0.0005751050775870681, "learning_rate": 1.7479674796747966e-07, "loss": -0.0063, "num_tokens": 1346527.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7553319931030273, "sampling/importance_sampling_ratio/mean": 0.9997537136077881, "sampling/importance_sampling_ratio/min": 0.4791052043437958, "sampling/sampling_logp_difference/max": 0.735835075378418, "sampling/sampling_logp_difference/mean": 0.013450969010591507, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 206.921875, "completions/mean_terminated_length": 206.921875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2584640681743622, "epoch": 0.05514705882352941, "frac_reward_zero_std": 0.25, "grad_norm": 2.588613093482011, "kl": 0.0011465022107586265, "learning_rate": 1.7886178861788619e-07, "loss": -0.0703, "num_tokens": 1389658.0, "reward": 0.5, "reward_std": 0.6116957664489746, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003669261932373, "sampling/importance_sampling_ratio/min": 0.0739443451166153, "sampling/sampling_logp_difference/max": 2.604442596435547, "sampling/sampling_logp_difference/mean": 0.018720664083957672, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 149.28125, "completions/mean_terminated_length": 149.28125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.13874804973602295, "epoch": 0.056372549019607844, "frac_reward_zero_std": 1.0, "grad_norm": 0.01525103126378843, "kl": 0.001152846380136907, "learning_rate": 1.8292682926829268e-07, "loss": 0.0, "num_tokens": 1411132.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.894408106803894, "sampling/importance_sampling_ratio/mean": 0.9993637800216675, "sampling/importance_sampling_ratio/min": 0.3645811975002289, "sampling/sampling_logp_difference/max": 1.0090060234069824, "sampling/sampling_logp_difference/mean": 0.013426810503005981, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 223.140625, "completions/mean_terminated_length": 223.140625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.20380157232284546, "epoch": 0.05759803921568627, "frac_reward_zero_std": 0.25, "grad_norm": 2.266503743158167, "kl": 0.0008492980268783867, "learning_rate": 1.8699186991869917e-07, "loss": -0.0133, "num_tokens": 1441365.0, "reward": 0.15625, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.781062364578247, "sampling/importance_sampling_ratio/mean": 1.0001952648162842, "sampling/importance_sampling_ratio/min": 0.301305890083313, "sampling/sampling_logp_difference/max": 1.1996291875839233, "sampling/sampling_logp_difference/mean": 0.014757132157683372, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 239.140625, "completions/mean_terminated_length": 239.140625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.1466808319091797, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.75, "grad_norm": 2.1656890291326536, "kl": 0.000727055361494422, "learning_rate": 1.910569105691057e-07, "loss": 0.0332, "num_tokens": 1471422.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997957944869995, "sampling/importance_sampling_ratio/min": 0.2538463771343231, "sampling/sampling_logp_difference/max": 1.3710259199142456, "sampling/sampling_logp_difference/mean": 0.011648212559521198, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 148.03125, "completions/mean_terminated_length": 148.03125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.22562922537326813, "epoch": 0.06004901960784314, "frac_reward_zero_std": 0.5, "grad_norm": 2.96917802973487, "kl": 0.0011090695625171065, "learning_rate": 1.951219512195122e-07, "loss": 0.0079, "num_tokens": 1499280.0, "reward": 0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000557899475098, "sampling/importance_sampling_ratio/min": 0.430244505405426, "sampling/sampling_logp_difference/max": 0.8434016704559326, "sampling/sampling_logp_difference/mean": 0.01785694807767868, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 146.265625, "completions/mean_terminated_length": 146.265625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.17815472185611725, "epoch": 0.061274509803921566, "frac_reward_zero_std": 0.5, "grad_norm": 3.046318973647959, "kl": 0.0008414412150159478, "learning_rate": 1.9918699186991868e-07, "loss": 0.0334, "num_tokens": 1525313.0, "reward": 0.5, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007548332214355, "sampling/importance_sampling_ratio/min": 0.3567883372306824, "sampling/sampling_logp_difference/max": 1.030612587928772, "sampling/sampling_logp_difference/mean": 0.0150204598903656, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 161.265625, "completions/mean_terminated_length": 161.265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.21071355044841766, "epoch": 0.0625, "frac_reward_zero_std": 0.25, "grad_norm": 2.8969991490441087, "kl": 0.0011610030196607113, "learning_rate": 2.032520325203252e-07, "loss": -0.0127, "num_tokens": 1553170.0, "reward": 0.71875, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993498921394348, "sampling/importance_sampling_ratio/min": 0.48236799240112305, "sampling/sampling_logp_difference/max": 0.7290480136871338, "sampling/sampling_logp_difference/mean": 0.01719040796160698, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 235.4375, "completions/mean_terminated_length": 235.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.26339125633239746, "epoch": 0.06372549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 2.5234770083205347, "kl": 0.001336791436187923, "learning_rate": 2.073170731707317e-07, "loss": 0.0031, "num_tokens": 1594062.0, "reward": 0.3125, "reward_std": 0.5847553014755249, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996271729469299, "sampling/importance_sampling_ratio/min": 0.170707106590271, "sampling/sampling_logp_difference/max": 1.767806053161621, "sampling/sampling_logp_difference/mean": 0.021252326667308807, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 161.125, "completions/mean_terminated_length": 161.125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.23058223724365234, "epoch": 0.06495098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 1.943638404142557, "kl": 0.0011498222593218088, "learning_rate": 2.1138211382113822e-07, "loss": 0.001, "num_tokens": 1625990.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000584363937378, "sampling/importance_sampling_ratio/min": 0.4440935552120209, "sampling/sampling_logp_difference/max": 1.2157726287841797, "sampling/sampling_logp_difference/mean": 0.017671234905719757, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 176.78125, "completions/mean_terminated_length": 176.78125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2276969850063324, "epoch": 0.0661764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.9465295598557404, "kl": 0.0011563922744244337, "learning_rate": 2.154471544715447e-07, "loss": -0.0184, "num_tokens": 1652504.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000014305114746, "sampling/importance_sampling_ratio/min": 0.3149414360523224, "sampling/sampling_logp_difference/max": 1.1553685665130615, "sampling/sampling_logp_difference/mean": 0.016492661088705063, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2518764138221741, "epoch": 0.06740196078431372, "frac_reward_zero_std": 0.5, "grad_norm": 2.0451391458360093, "kl": 0.0009489314979873598, "learning_rate": 2.195121951219512e-07, "loss": -0.0357, "num_tokens": 1689400.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.623826026916504, "sampling/importance_sampling_ratio/mean": 1.0000336170196533, "sampling/importance_sampling_ratio/min": 0.3736537992954254, "sampling/sampling_logp_difference/max": 0.9844256639480591, "sampling/sampling_logp_difference/mean": 0.018169749528169632, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 184.953125, "completions/mean_terminated_length": 184.953125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.15880240499973297, "epoch": 0.06862745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.015698177885746345, "kl": 0.0010644992580637336, "learning_rate": 2.235772357723577e-07, "loss": 0.0, "num_tokens": 1720149.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995625019073486, "sampling/importance_sampling_ratio/min": 0.29187583923339844, "sampling/sampling_logp_difference/max": 1.23142671585083, "sampling/sampling_logp_difference/mean": 0.014251098968088627, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 189.859375, "completions/mean_terminated_length": 189.859375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.20032057166099548, "epoch": 0.06985294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.9733158470984498, "kl": 0.001350488979369402, "learning_rate": 2.2764227642276422e-07, "loss": -0.0297, "num_tokens": 1749228.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.98750638961792, "sampling/importance_sampling_ratio/mean": 1.0003879070281982, "sampling/importance_sampling_ratio/min": 0.36696475744247437, "sampling/sampling_logp_difference/max": 1.0024895668029785, "sampling/sampling_logp_difference/mean": 0.01593562588095665, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.19529198110103607, "epoch": 0.07107843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.794206828484955, "kl": 0.0013890969567000866, "learning_rate": 2.3170731707317074e-07, "loss": -0.0165, "num_tokens": 1773924.0, "reward": -0.15625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998757243156433, "sampling/importance_sampling_ratio/min": 0.4793126881122589, "sampling/sampling_logp_difference/max": 0.7354021072387695, "sampling/sampling_logp_difference/mean": 0.015716655179858208, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 189.890625, "completions/mean_terminated_length": 189.890625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.142284095287323, "epoch": 0.07230392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.2566331067579215, "kl": 0.0008152315858751535, "learning_rate": 2.3577235772357723e-07, "loss": -0.0102, "num_tokens": 1800413.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002665519714355, "sampling/importance_sampling_ratio/min": 0.459773987531662, "sampling/sampling_logp_difference/max": 1.1954364776611328, "sampling/sampling_logp_difference/mean": 0.011489486321806908, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 183.1875, "completions/mean_terminated_length": 183.1875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.24612563848495483, "epoch": 0.07352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 2.469674402816741, "kl": 0.0011279808823019266, "learning_rate": 2.3983739837398373e-07, "loss": -0.0085, "num_tokens": 1826601.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992374181747437, "sampling/importance_sampling_ratio/min": 0.3642270565032959, "sampling/sampling_logp_difference/max": 1.0099778175354004, "sampling/sampling_logp_difference/mean": 0.01788927987217903, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 196.609375, "completions/mean_terminated_length": 196.609375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.18494541943073273, "epoch": 0.07475490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 2.342195339421481, "kl": 0.0011641057208180428, "learning_rate": 2.439024390243902e-07, "loss": 0.0125, "num_tokens": 1859008.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.923892617225647, "sampling/importance_sampling_ratio/mean": 1.000486135482788, "sampling/importance_sampling_ratio/min": 0.24550510942935944, "sampling/sampling_logp_difference/max": 1.40443754196167, "sampling/sampling_logp_difference/mean": 0.01435195654630661, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 178.765625, "completions/mean_terminated_length": 178.765625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.17995566129684448, "epoch": 0.07598039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 2.266713412472842, "kl": 0.0013068044790998101, "learning_rate": 2.479674796747967e-07, "loss": 0.0098, "num_tokens": 1888129.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002384185791016, "sampling/importance_sampling_ratio/min": 0.336388498544693, "sampling/sampling_logp_difference/max": 1.1315093040466309, "sampling/sampling_logp_difference/mean": 0.016184909269213676, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 151.484375, "completions/mean_terminated_length": 151.484375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.18587751686573029, "epoch": 0.07720588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 2.4786554405848515, "kl": 0.0009879636345431209, "learning_rate": 2.520325203252032e-07, "loss": -0.0322, "num_tokens": 1912080.0, "reward": -0.53125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": -0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9112218618392944, "sampling/importance_sampling_ratio/mean": 1.0004534721374512, "sampling/importance_sampling_ratio/min": 0.23113895952701569, "sampling/sampling_logp_difference/max": 1.4647362232208252, "sampling/sampling_logp_difference/mean": 0.015230939723551273, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 151.78125, "completions/mean_terminated_length": 151.78125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.24000594019889832, "epoch": 0.0784313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 2.96170069401628, "kl": 0.0012934945989400148, "learning_rate": 2.5609756097560976e-07, "loss": 0.0851, "num_tokens": 1942722.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000030994415283, "sampling/importance_sampling_ratio/min": 0.3903812766075134, "sampling/sampling_logp_difference/max": 0.9406313896179199, "sampling/sampling_logp_difference/mean": 0.017442770302295685, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 224.609375, "completions/mean_terminated_length": 224.609375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.17163443565368652, "epoch": 0.07965686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 1.843371466059802, "kl": 0.0009480844018980861, "learning_rate": 2.6016260162601625e-07, "loss": -0.0052, "num_tokens": 1976409.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994733333587646, "sampling/importance_sampling_ratio/min": 0.4611442983150482, "sampling/sampling_logp_difference/max": 0.7740442752838135, "sampling/sampling_logp_difference/mean": 0.012315905652940273, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.21215350925922394, "epoch": 0.08088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 2.213617107549737, "kl": 0.0016216032672673464, "learning_rate": 2.6422764227642274e-07, "loss": 0.0318, "num_tokens": 2016729.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005450248718262, "sampling/importance_sampling_ratio/min": 0.3727061450481415, "sampling/sampling_logp_difference/max": 0.986965000629425, "sampling/sampling_logp_difference/mean": 0.01834452524781227, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 146.921875, "completions/mean_terminated_length": 146.921875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.18147198855876923, "epoch": 0.0821078431372549, "frac_reward_zero_std": 0.25, "grad_norm": 3.293958099941274, "kl": 0.0013055060990154743, "learning_rate": 2.682926829268293e-07, "loss": 0.0432, "num_tokens": 2042116.0, "reward": 0.375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6976975202560425, "sampling/importance_sampling_ratio/mean": 0.9998291730880737, "sampling/importance_sampling_ratio/min": 0.4755859076976776, "sampling/sampling_logp_difference/max": 0.7432076930999756, "sampling/sampling_logp_difference/mean": 0.016954217106103897, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 198.4375, "completions/mean_terminated_length": 198.4375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.22552774846553802, "epoch": 0.08333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 1.98609466690851, "kl": 0.0016288069309666753, "learning_rate": 2.7235772357723573e-07, "loss": 0.0065, "num_tokens": 2080208.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7076270580291748, "sampling/importance_sampling_ratio/mean": 0.9996429085731506, "sampling/importance_sampling_ratio/min": 0.23136155307292938, "sampling/sampling_logp_difference/max": 1.4637736082077026, "sampling/sampling_logp_difference/mean": 0.01801292598247528, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 200.890625, "completions/mean_terminated_length": 200.890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.23393692076206207, "epoch": 0.08455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 2.515330783548267, "kl": 0.0011547683971002698, "learning_rate": 2.764227642276423e-07, "loss": 0.0033, "num_tokens": 2109545.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998908042907715, "sampling/importance_sampling_ratio/min": 0.4810388684272766, "sampling/sampling_logp_difference/max": 0.8235739469528198, "sampling/sampling_logp_difference/mean": 0.017789803445339203, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 172.078125, "completions/mean_terminated_length": 172.078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.25954240560531616, "epoch": 0.0857843137254902, "frac_reward_zero_std": 0.0, "grad_norm": 2.7719277208100754, "kl": 0.001475530443713069, "learning_rate": 2.8048780487804877e-07, "loss": -0.0355, "num_tokens": 2143198.0, "reward": 0.3125, "reward_std": 0.7059217691421509, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6989006996154785, "sampling/importance_sampling_ratio/mean": 0.9999538660049438, "sampling/importance_sampling_ratio/min": 0.37945547699928284, "sampling/sampling_logp_difference/max": 0.9690179824829102, "sampling/sampling_logp_difference/mean": 0.017497912049293518, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.17055967450141907, "epoch": 0.08700980392156862, "frac_reward_zero_std": 0.5, "grad_norm": 2.0908010404916784, "kl": 0.0010060817003250122, "learning_rate": 2.8455284552845527e-07, "loss": -0.1028, "num_tokens": 2168814.0, "reward": 0.46875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8770861625671387, "sampling/importance_sampling_ratio/mean": 0.9999717473983765, "sampling/importance_sampling_ratio/min": 0.5698131918907166, "sampling/sampling_logp_difference/max": 0.6297206878662109, "sampling/sampling_logp_difference/mean": 0.011747198179364204, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 210.21875, "completions/mean_terminated_length": 210.21875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2961871027946472, "epoch": 0.08823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.827403572423837, "kl": 0.0019968498963862658, "learning_rate": 2.886178861788618e-07, "loss": 0.0022, "num_tokens": 2198684.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.807397484779358, "sampling/importance_sampling_ratio/mean": 1.00038743019104, "sampling/importance_sampling_ratio/min": 0.33141231536865234, "sampling/sampling_logp_difference/max": 1.1043920516967773, "sampling/sampling_logp_difference/mean": 0.019413597881793976, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 253.5625, "completions/mean_terminated_length": 253.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2353845089673996, "epoch": 0.08946078431372549, "frac_reward_zero_std": 0.25, "grad_norm": 2.1551761848321918, "kl": 0.001148139126598835, "learning_rate": 2.9268292682926825e-07, "loss": 0.0357, "num_tokens": 2233728.0, "reward": -0.3125, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6350189447402954, "sampling/importance_sampling_ratio/mean": 0.9997013807296753, "sampling/importance_sampling_ratio/min": 0.42648062109947205, "sampling/sampling_logp_difference/max": 0.8521883487701416, "sampling/sampling_logp_difference/mean": 0.01445393543690443, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 208.71875, "completions/mean_terminated_length": 208.71875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.17318938672542572, "epoch": 0.09068627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 1.7886602733080035, "kl": 0.001269905362278223, "learning_rate": 2.967479674796748e-07, "loss": 0.0193, "num_tokens": 2260334.0, "reward": 0.15625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7259013652801514, "sampling/importance_sampling_ratio/mean": 0.9998394250869751, "sampling/importance_sampling_ratio/min": 0.39690473675727844, "sampling/sampling_logp_difference/max": 0.9240590333938599, "sampling/sampling_logp_difference/mean": 0.014188411645591259, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.20890933275222778, "epoch": 0.09191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.684631997120434, "kl": 0.0016565986443310976, "learning_rate": 3.008130081300813e-07, "loss": 0.001, "num_tokens": 2290786.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6600470542907715, "sampling/importance_sampling_ratio/mean": 0.9995492696762085, "sampling/importance_sampling_ratio/min": 0.336929053068161, "sampling/sampling_logp_difference/max": 1.0878829956054688, "sampling/sampling_logp_difference/mean": 0.015820588916540146, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 168.46875, "completions/mean_terminated_length": 168.46875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.16517484188079834, "epoch": 0.09313725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 2.299386141823514, "kl": 0.001505703548900783, "learning_rate": 3.048780487804878e-07, "loss": 0.0102, "num_tokens": 2315920.0, "reward": -0.28125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999821186065674, "sampling/importance_sampling_ratio/min": 0.2679966688156128, "sampling/sampling_logp_difference/max": 1.3167808055877686, "sampling/sampling_logp_difference/mean": 0.013711988925933838, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 208.59375, "completions/mean_terminated_length": 208.59375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.1949157863855362, "epoch": 0.09436274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.2605270490870344, "kl": 0.0017528904136270285, "learning_rate": 3.0894308943089434e-07, "loss": -0.0173, "num_tokens": 2346630.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999854326248169, "sampling/importance_sampling_ratio/min": 0.29550692439079285, "sampling/sampling_logp_difference/max": 1.2540783882141113, "sampling/sampling_logp_difference/mean": 0.015238778665661812, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 251.453125, "completions/mean_terminated_length": 251.453125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.20913855731487274, "epoch": 0.09558823529411764, "frac_reward_zero_std": 0.25, "grad_norm": 2.0026285862178974, "kl": 0.0016085922252386808, "learning_rate": 3.130081300813008e-07, "loss": -0.0103, "num_tokens": 2377267.0, "reward": -0.28125, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010101795196533, "sampling/importance_sampling_ratio/min": 0.5038118958473206, "sampling/sampling_logp_difference/max": 0.7567176818847656, "sampling/sampling_logp_difference/mean": 0.015605229884386063, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 197.8125, "completions/mean_terminated_length": 197.8125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24096760153770447, "epoch": 0.09681372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 2.7601953560179315, "kl": 0.0015045834006741643, "learning_rate": 3.170731707317073e-07, "loss": 0.0355, "num_tokens": 2411895.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999889135360718, "sampling/importance_sampling_ratio/min": 0.47943076491355896, "sampling/sampling_logp_difference/max": 0.8563418388366699, "sampling/sampling_logp_difference/mean": 0.016151513904333115, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 186.90625, "completions/mean_terminated_length": 186.90625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.22716647386550903, "epoch": 0.09803921568627451, "frac_reward_zero_std": 0.0, "grad_norm": 3.111168991316798, "kl": 0.002549254335463047, "learning_rate": 3.211382113821138e-07, "loss": 0.0048, "num_tokens": 2443393.0, "reward": 0.5, "reward_std": 0.843070387840271, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998606443405151, "sampling/importance_sampling_ratio/min": 0.24906150996685028, "sampling/sampling_logp_difference/max": 1.3900554180145264, "sampling/sampling_logp_difference/mean": 0.018136531114578247, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 135.546875, "completions/mean_terminated_length": 135.546875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.17298495769500732, "epoch": 0.09926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 3.258324059124309, "kl": 0.002044468652456999, "learning_rate": 3.252032520325203e-07, "loss": 0.0254, "num_tokens": 2470196.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005346536636353, "sampling/importance_sampling_ratio/min": 0.458995521068573, "sampling/sampling_logp_difference/max": 0.9160494804382324, "sampling/sampling_logp_difference/mean": 0.01386941596865654, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18176504969596863, "epoch": 0.10049019607843138, "frac_reward_zero_std": 0.5, "grad_norm": 2.0259646735635353, "kl": 0.0018528061918914318, "learning_rate": 3.292682926829268e-07, "loss": 0.0875, "num_tokens": 2496772.0, "reward": -0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.766034722328186, "sampling/importance_sampling_ratio/mean": 0.9997857213020325, "sampling/importance_sampling_ratio/min": 0.5311629772186279, "sampling/sampling_logp_difference/max": 0.6326863765716553, "sampling/sampling_logp_difference/mean": 0.01508853118866682, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 192.34375, "completions/mean_terminated_length": 192.34375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.18329501152038574, "epoch": 0.1017156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.778822142019011, "kl": 0.0023378990590572357, "learning_rate": 3.333333333333333e-07, "loss": 0.001, "num_tokens": 2523226.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995636343955994, "sampling/importance_sampling_ratio/min": 0.4812057614326477, "sampling/sampling_logp_difference/max": 0.7314603328704834, "sampling/sampling_logp_difference/mean": 0.015016846358776093, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 239.4375, "completions/mean_terminated_length": 239.4375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.21985948085784912, "epoch": 0.10294117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 1.8214459082790309, "kl": 0.0020694350823760033, "learning_rate": 3.3739837398373985e-07, "loss": -0.0622, "num_tokens": 2560038.0, "reward": -0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997300505638123, "sampling/importance_sampling_ratio/min": 0.3163585066795349, "sampling/sampling_logp_difference/max": 1.1508792638778687, "sampling/sampling_logp_difference/mean": 0.014631749130785465, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 241.359375, "completions/mean_terminated_length": 241.359375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2028217315673828, "epoch": 0.10416666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 2.2121606808342307, "kl": 0.002727421699091792, "learning_rate": 3.4146341463414634e-07, "loss": 0.0238, "num_tokens": 2597037.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.825548529624939, "sampling/importance_sampling_ratio/mean": 0.9998873472213745, "sampling/importance_sampling_ratio/min": 0.21376019716262817, "sampling/sampling_logp_difference/max": 1.5429004430770874, "sampling/sampling_logp_difference/mean": 0.015795081853866577, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 155.171875, "completions/mean_terminated_length": 155.171875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.16458287835121155, "epoch": 0.1053921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.026919855382934496, "kl": 0.0018414882943034172, "learning_rate": 3.4552845528455284e-07, "loss": 0.0, "num_tokens": 2622520.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9431533813476562, "sampling/importance_sampling_ratio/mean": 1.0006672143936157, "sampling/importance_sampling_ratio/min": 0.42132270336151123, "sampling/sampling_logp_difference/max": 0.8643562197685242, "sampling/sampling_logp_difference/mean": 0.01315943244844675, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 273.34375, "completions/mean_terminated_length": 273.34375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.23761431872844696, "epoch": 0.10661764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.419811549150165, "kl": 0.0020909798331558704, "learning_rate": 3.4959349593495933e-07, "loss": 0.021, "num_tokens": 2657998.0, "reward": 0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002392530441284, "sampling/importance_sampling_ratio/min": 0.4424218535423279, "sampling/sampling_logp_difference/max": 0.8322451114654541, "sampling/sampling_logp_difference/mean": 0.01570003293454647, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 233.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.14569368958473206, "epoch": 0.10784313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.9791468306525047, "kl": 0.001645655371248722, "learning_rate": 3.536585365853658e-07, "loss": -0.0023, "num_tokens": 2687702.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.9099587202072144, "sampling/importance_sampling_ratio/mean": 1.00053071975708, "sampling/importance_sampling_ratio/min": 0.29515644907951355, "sampling/sampling_logp_difference/max": 1.2202496528625488, "sampling/sampling_logp_difference/mean": 0.012327494099736214, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 206.796875, "completions/mean_terminated_length": 206.796875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.13218270242214203, "epoch": 0.1090686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 2.066458335616022, "kl": 0.0034873075783252716, "learning_rate": 3.5772357723577237e-07, "loss": -0.0872, "num_tokens": 2719641.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9223304986953735, "sampling/importance_sampling_ratio/mean": 0.998964786529541, "sampling/importance_sampling_ratio/min": 0.24123281240463257, "sampling/sampling_logp_difference/max": 1.4219927787780762, "sampling/sampling_logp_difference/mean": 0.01427120715379715, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 236.03125, "completions/mean_terminated_length": 236.03125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2090870440006256, "epoch": 0.11029411764705882, "frac_reward_zero_std": 0.25, "grad_norm": 2.4617604302787153, "kl": 0.0022184555418789387, "learning_rate": 3.6178861788617886e-07, "loss": 0.0033, "num_tokens": 2753547.0, "reward": 0.40625, "reward_std": 0.6046693325042725, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993847608566284, "sampling/importance_sampling_ratio/min": 0.07257959991693497, "sampling/sampling_logp_difference/max": 2.6230714321136475, "sampling/sampling_logp_difference/mean": 0.015504513867199421, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 203.328125, "completions/mean_terminated_length": 203.328125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.21814948320388794, "epoch": 0.11151960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.3855837266028832, "kl": 0.003080436959862709, "learning_rate": 3.6585365853658536e-07, "loss": 0.036, "num_tokens": 2785984.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.697697401046753, "sampling/importance_sampling_ratio/mean": 1.000431776046753, "sampling/importance_sampling_ratio/min": 0.5824047923088074, "sampling/sampling_logp_difference/max": 0.5405895709991455, "sampling/sampling_logp_difference/mean": 0.01569124311208725, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.15251809358596802, "epoch": 0.11274509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.3646620053083798, "kl": 0.0027233543805778027, "learning_rate": 3.6991869918699185e-07, "loss": -0.0658, "num_tokens": 2809436.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.9242669343948364, "sampling/importance_sampling_ratio/mean": 1.0009275674819946, "sampling/importance_sampling_ratio/min": 0.3941422700881958, "sampling/sampling_logp_difference/max": 0.9310433864593506, "sampling/sampling_logp_difference/mean": 0.011663028970360756, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 248.28125, "completions/mean_terminated_length": 248.28125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.20643439888954163, "epoch": 0.11397058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 2.231436724426304, "kl": 0.003451679367572069, "learning_rate": 3.7398373983739835e-07, "loss": -0.046, "num_tokens": 2844286.0, "reward": 0.46875, "reward_std": 0.8837460875511169, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000553131103516, "sampling/importance_sampling_ratio/min": 0.29556503891944885, "sampling/sampling_logp_difference/max": 1.2188663482666016, "sampling/sampling_logp_difference/mean": 0.015196739695966244, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 152.84375, "completions/mean_terminated_length": 152.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.14843550324440002, "epoch": 0.11519607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 2.5282982854898655, "kl": 0.002533209975808859, "learning_rate": 3.7804878048780484e-07, "loss": 0.0155, "num_tokens": 2867652.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7544697523117065, "sampling/importance_sampling_ratio/mean": 1.0003670454025269, "sampling/importance_sampling_ratio/min": 0.5090111494064331, "sampling/sampling_logp_difference/max": 0.6752853393554688, "sampling/sampling_logp_difference/mean": 0.012290108948946, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 252.328125, "completions/mean_terminated_length": 252.328125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.21648448705673218, "epoch": 0.11642156862745098, "frac_reward_zero_std": 0.0, "grad_norm": 2.1210581745746024, "kl": 0.00387467909604311, "learning_rate": 3.821138211382114e-07, "loss": -0.0213, "num_tokens": 2902793.0, "reward": 0.34375, "reward_std": 0.747555673122406, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996301531791687, "sampling/importance_sampling_ratio/min": 0.4053426682949066, "sampling/sampling_logp_difference/max": 1.3121697902679443, "sampling/sampling_logp_difference/mean": 0.015475263819098473, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 142.421875, "completions/mean_terminated_length": 142.421875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.2054484337568283, "epoch": 0.11764705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 2.2365132525166236, "kl": 0.00443034153431654, "learning_rate": 3.861788617886179e-07, "loss": -0.0237, "num_tokens": 2932836.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.873075246810913, "sampling/importance_sampling_ratio/mean": 0.9996028542518616, "sampling/importance_sampling_ratio/min": 0.11423374712467194, "sampling/sampling_logp_difference/max": 2.169508457183838, "sampling/sampling_logp_difference/mean": 0.017275255173444748, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 185.5625, "completions/mean_terminated_length": 185.5625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.1749388575553894, "epoch": 0.11887254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.7714766396209334, "kl": 0.002680524019524455, "learning_rate": 3.902439024390244e-07, "loss": -0.0616, "num_tokens": 2961608.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999269425868988, "sampling/importance_sampling_ratio/min": 0.495414674282074, "sampling/sampling_logp_difference/max": 0.7522246837615967, "sampling/sampling_logp_difference/mean": 0.013655086979269981, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.22369414567947388, "epoch": 0.12009803921568628, "frac_reward_zero_std": 0.25, "grad_norm": 2.3642297674506656, "kl": 0.004713735543191433, "learning_rate": 3.9430894308943087e-07, "loss": -0.039, "num_tokens": 2989848.0, "reward": 0.4375, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001537799835205, "sampling/importance_sampling_ratio/min": 0.06123197078704834, "sampling/sampling_logp_difference/max": 2.793085813522339, "sampling/sampling_logp_difference/mean": 0.016208482906222343, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 190.171875, "completions/mean_terminated_length": 190.171875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.184893399477005, "epoch": 0.1213235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 2.2527448488608326, "kl": 0.002585100941359997, "learning_rate": 3.9837398373983736e-07, "loss": 0.0081, "num_tokens": 3018771.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7099417448043823, "sampling/importance_sampling_ratio/mean": 0.999454140663147, "sampling/importance_sampling_ratio/min": 0.38532912731170654, "sampling/sampling_logp_difference/max": 0.9536573886871338, "sampling/sampling_logp_difference/mean": 0.016417233273386955, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 237.203125, "completions/mean_terminated_length": 237.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22887080907821655, "epoch": 0.12254901960784313, "frac_reward_zero_std": 0.5, "grad_norm": 1.7804555425711541, "kl": 0.0029146044980734587, "learning_rate": 4.024390243902439e-07, "loss": 0.0017, "num_tokens": 3053056.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001249313354492, "sampling/importance_sampling_ratio/min": 0.49485504627227783, "sampling/sampling_logp_difference/max": 0.8862013816833496, "sampling/sampling_logp_difference/mean": 0.01566164568066597, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 204.46875, "completions/mean_terminated_length": 204.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2525796890258789, "epoch": 0.12377450980392157, "frac_reward_zero_std": 0.25, "grad_norm": 2.717185626522858, "kl": 0.003638624679297209, "learning_rate": 4.065040650406504e-07, "loss": 0.0101, "num_tokens": 3085022.0, "reward": 0.5625, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995367527008057, "sampling/importance_sampling_ratio/min": 0.5260770916938782, "sampling/sampling_logp_difference/max": 0.8979051113128662, "sampling/sampling_logp_difference/mean": 0.018047038465738297, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 270.453125, "completions/mean_terminated_length": 195.38096618652344, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.1673162579536438, "epoch": 0.125, "frac_reward_zero_std": 0.5, "grad_norm": 1.6617256672581135, "kl": 0.003493118565529585, "learning_rate": 4.105691056910569e-07, "loss": 0.5592, "num_tokens": 3121419.0, "reward": -0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996997117996216, "sampling/importance_sampling_ratio/min": 0.1413872390985489, "sampling/sampling_logp_difference/max": 1.9562528133392334, "sampling/sampling_logp_difference/mean": 0.01538526639342308, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 135.90625, "completions/mean_terminated_length": 135.90625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.19178907573223114, "epoch": 0.12622549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 2.9304998229222012, "kl": 0.004775335546582937, "learning_rate": 4.146341463414634e-07, "loss": 0.0087, "num_tokens": 3150517.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8355188369750977, "sampling/importance_sampling_ratio/mean": 1.0008076429367065, "sampling/importance_sampling_ratio/min": 0.40564560890197754, "sampling/sampling_logp_difference/max": 0.9022754430770874, "sampling/sampling_logp_difference/mean": 0.0178519319742918, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 196.859375, "completions/mean_terminated_length": 196.859375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.21618320047855377, "epoch": 0.12745098039215685, "frac_reward_zero_std": 0.25, "grad_norm": 2.7597974618964067, "kl": 0.002705794293433428, "learning_rate": 4.186991869918699e-07, "loss": 0.1055, "num_tokens": 3181788.0, "reward": 0.375, "reward_std": 0.5651718378067017, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001839399337769, "sampling/importance_sampling_ratio/min": 0.23663082718849182, "sampling/sampling_logp_difference/max": 1.4412540197372437, "sampling/sampling_logp_difference/mean": 0.016408627852797508, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 259.0625, "completions/mean_terminated_length": 259.0625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.19014139473438263, "epoch": 0.12867647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.5398076256891096, "kl": 0.0030213571153581142, "learning_rate": 4.2276422764227643e-07, "loss": 0.0671, "num_tokens": 3217744.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.9693632125854492, "sampling/importance_sampling_ratio/mean": 1.0007412433624268, "sampling/importance_sampling_ratio/min": 0.3723587989807129, "sampling/sampling_logp_difference/max": 0.9878973960876465, "sampling/sampling_logp_difference/mean": 0.014151263982057571, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 151.921875, "completions/mean_terminated_length": 151.921875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.17414289712905884, "epoch": 0.12990196078431374, "frac_reward_zero_std": 0.75, "grad_norm": 1.9964054092101362, "kl": 0.0042853644117712975, "learning_rate": 4.268292682926829e-07, "loss": 0.0603, "num_tokens": 3242923.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5903403759002686, "sampling/importance_sampling_ratio/mean": 1.0001468658447266, "sampling/importance_sampling_ratio/min": 0.20117127895355225, "sampling/sampling_logp_difference/max": 1.6035985946655273, "sampling/sampling_logp_difference/mean": 0.016937680542469025, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 243.296875, "completions/mean_terminated_length": 243.296875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.14538052678108215, "epoch": 0.13112745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.776854554423682, "kl": 0.0027929027564823627, "learning_rate": 4.308943089430894e-07, "loss": 0.0225, "num_tokens": 3283790.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999587535858154, "sampling/importance_sampling_ratio/min": 0.43359288573265076, "sampling/sampling_logp_difference/max": 1.1120898723602295, "sampling/sampling_logp_difference/mean": 0.012797070667147636, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.1644519865512848, "epoch": 0.1323529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 3.0964938129632356, "kl": 0.003369982587173581, "learning_rate": 4.349593495934959e-07, "loss": 0.0225, "num_tokens": 3321534.0, "reward": 0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7989542484283447, "sampling/importance_sampling_ratio/mean": 0.9994832277297974, "sampling/importance_sampling_ratio/min": 0.027374885976314545, "sampling/sampling_logp_difference/max": 3.5981292724609375, "sampling/sampling_logp_difference/mean": 0.013361322693526745, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 256.609375, "completions/mean_terminated_length": 256.609375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.20587630569934845, "epoch": 0.13357843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.5257994074267291, "kl": 0.0031718644313514233, "learning_rate": 4.390243902439024e-07, "loss": 0.0223, "num_tokens": 3362229.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002754926681519, "sampling/importance_sampling_ratio/min": 0.011834507808089256, "sampling/sampling_logp_difference/max": 4.4367356300354, "sampling/sampling_logp_difference/mean": 0.01595362275838852, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 229.296875, "completions/mean_terminated_length": 229.296875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.15049073100090027, "epoch": 0.13480392156862744, "frac_reward_zero_std": 1.0, "grad_norm": 0.024043545684525217, "kl": 0.0024401461705565453, "learning_rate": 4.4308943089430896e-07, "loss": 0.0, "num_tokens": 3404056.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8615195751190186, "sampling/importance_sampling_ratio/mean": 0.9994591474533081, "sampling/importance_sampling_ratio/min": 0.1321420669555664, "sampling/sampling_logp_difference/max": 2.0238776206970215, "sampling/sampling_logp_difference/mean": 0.011990568600594997, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 140.734375, "completions/mean_terminated_length": 140.734375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.15258745849132538, "epoch": 0.13602941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.036371133195659794, "kl": 0.0029322488699108362, "learning_rate": 4.471544715447154e-07, "loss": 0.0, "num_tokens": 3427655.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6232942342758179, "sampling/importance_sampling_ratio/mean": 0.9999867081642151, "sampling/importance_sampling_ratio/min": 0.5510865449905396, "sampling/sampling_logp_difference/max": 0.5958633422851562, "sampling/sampling_logp_difference/mean": 0.012529904022812843, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 268.984375, "completions/mean_terminated_length": 268.984375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.24910208582878113, "epoch": 0.13725490196078433, "frac_reward_zero_std": 0.25, "grad_norm": 1.9207390396724677, "kl": 0.0038540945388376713, "learning_rate": 4.5121951219512194e-07, "loss": -0.1523, "num_tokens": 3460934.0, "reward": 0.25, "reward_std": 0.6813369989395142, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997692108154297, "sampling/importance_sampling_ratio/min": 0.22504088282585144, "sampling/sampling_logp_difference/max": 1.4914731979370117, "sampling/sampling_logp_difference/mean": 0.015616269782185555, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 185.921875, "completions/mean_terminated_length": 185.921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1516919881105423, "epoch": 0.13848039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 1.5625968120291285, "kl": 0.0030631846748292446, "learning_rate": 4.5528455284552844e-07, "loss": 0.0062, "num_tokens": 3494769.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002806186676025, "sampling/importance_sampling_ratio/min": 0.509614884853363, "sampling/sampling_logp_difference/max": 0.6986632347106934, "sampling/sampling_logp_difference/mean": 0.012877561151981354, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 175.953125, "completions/mean_terminated_length": 175.953125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.20463861525058746, "epoch": 0.13970588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 1.6959729876683496, "kl": 0.0027167543303221464, "learning_rate": 4.5934959349593493e-07, "loss": 0.0068, "num_tokens": 3523326.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8048852682113647, "sampling/importance_sampling_ratio/mean": 0.9999552369117737, "sampling/importance_sampling_ratio/min": 0.36874687671661377, "sampling/sampling_logp_difference/max": 0.9976449012756348, "sampling/sampling_logp_difference/mean": 0.015765059739351273, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 176.15625, "completions/mean_terminated_length": 176.15625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.11405276507139206, "epoch": 0.1409313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.028435623016476855, "kl": 0.002391491550952196, "learning_rate": 4.634146341463415e-07, "loss": 0.0, "num_tokens": 3550072.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.989517092704773, "sampling/importance_sampling_ratio/mean": 0.9999762177467346, "sampling/importance_sampling_ratio/min": 0.4801751971244812, "sampling/sampling_logp_difference/max": 0.7336042523384094, "sampling/sampling_logp_difference/mean": 0.0115530239418149, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 175.21875, "completions/mean_terminated_length": 175.21875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.18383879959583282, "epoch": 0.14215686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 1.926756574189135, "kl": 0.004470705054700375, "learning_rate": 4.674796747967479e-07, "loss": -0.0008, "num_tokens": 3584502.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006392002105713, "sampling/importance_sampling_ratio/min": 0.3582732081413269, "sampling/sampling_logp_difference/max": 1.1886632442474365, "sampling/sampling_logp_difference/mean": 0.01593351922929287, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 333.65625, "completions/mean_terminated_length": 333.65625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.13514362275600433, "epoch": 0.14338235294117646, "frac_reward_zero_std": 0.25, "grad_norm": 1.4031890143594556, "kl": 0.0020420460496097803, "learning_rate": 4.7154471544715447e-07, "loss": -0.0866, "num_tokens": 3624608.0, "reward": 0.0625, "reward_std": 0.617996096611023, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9954233169555664, "sampling/importance_sampling_ratio/mean": 1.0003677606582642, "sampling/importance_sampling_ratio/min": 0.4747653305530548, "sampling/sampling_logp_difference/max": 0.7449345588684082, "sampling/sampling_logp_difference/mean": 0.00980311818420887, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 246.296875, "completions/mean_terminated_length": 246.296875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1611090898513794, "epoch": 0.14460784313725492, "frac_reward_zero_std": 0.75, "grad_norm": 1.4060303837792323, "kl": 0.0024019847624003887, "learning_rate": 4.756097560975609e-07, "loss": -0.0066, "num_tokens": 3659251.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008327960968018, "sampling/importance_sampling_ratio/min": 0.2378438413143158, "sampling/sampling_logp_difference/max": 1.436141014099121, "sampling/sampling_logp_difference/mean": 0.012749040499329567, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 209.328125, "completions/mean_terminated_length": 209.328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2315022349357605, "epoch": 0.14583333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 2.9311765050475516, "kl": 0.004573897924274206, "learning_rate": 4.796747967479675e-07, "loss": -0.0414, "num_tokens": 3686840.0, "reward": -0.65625, "reward_std": 0.7366957664489746, "rewards/decision_reward_func/mean": -0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003242492675781, "sampling/importance_sampling_ratio/min": 0.4618137776851654, "sampling/sampling_logp_difference/max": 1.0040912628173828, "sampling/sampling_logp_difference/mean": 0.01711522415280342, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 193.453125, "completions/mean_terminated_length": 193.453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.17183716595172882, "epoch": 0.14705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 2.128245508696429, "kl": 0.0038222374860197306, "learning_rate": 4.83739837398374e-07, "loss": 0.0195, "num_tokens": 3714709.0, "reward": 0.53125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004196166992188, "sampling/importance_sampling_ratio/min": 0.11990854144096375, "sampling/sampling_logp_difference/max": 2.121026039123535, "sampling/sampling_logp_difference/mean": 0.013609963469207287, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 242.46875, "completions/mean_terminated_length": 242.46875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2437196969985962, "epoch": 0.1482843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.4598078819366105, "kl": 0.002234832150861621, "learning_rate": 4.878048780487804e-07, "loss": -0.0034, "num_tokens": 3753411.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.7094210386276245, "sampling/importance_sampling_ratio/mean": 0.9994633197784424, "sampling/importance_sampling_ratio/min": 0.22669436037540436, "sampling/sampling_logp_difference/max": 1.4841525554656982, "sampling/sampling_logp_difference/mean": 0.017895739525556564, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 207.90625, "completions/mean_terminated_length": 207.90625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.20850908756256104, "epoch": 0.14950980392156862, "frac_reward_zero_std": 0.75, "grad_norm": 1.5981813416819735, "kl": 0.0032731422688812017, "learning_rate": 4.91869918699187e-07, "loss": -0.0103, "num_tokens": 3786877.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5795906782150269, "sampling/importance_sampling_ratio/mean": 0.999427080154419, "sampling/importance_sampling_ratio/min": 0.3412606716156006, "sampling/sampling_logp_difference/max": 1.0751086473464966, "sampling/sampling_logp_difference/mean": 0.014012198895215988, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 158.421875, "completions/mean_terminated_length": 158.421875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.1915990114212036, "epoch": 0.15073529411764705, "frac_reward_zero_std": 0.25, "grad_norm": 3.2374537936724637, "kl": 0.003779832972213626, "learning_rate": 4.959349593495934e-07, "loss": 0.0142, "num_tokens": 3810856.0, "reward": 0.5, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.86378812789917, "sampling/importance_sampling_ratio/mean": 0.9994109272956848, "sampling/importance_sampling_ratio/min": 0.459695428609848, "sampling/sampling_logp_difference/max": 0.777191162109375, "sampling/sampling_logp_difference/mean": 0.015889719128608704, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 324.296875, "completions/mean_terminated_length": 324.296875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.19484072923660278, "epoch": 0.15196078431372548, "frac_reward_zero_std": 0.25, "grad_norm": 1.837653411054766, "kl": 0.0026376841124147177, "learning_rate": 5e-07, "loss": -0.0623, "num_tokens": 3859771.0, "reward": 0.15625, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995371699333191, "sampling/importance_sampling_ratio/min": 0.3069584369659424, "sampling/sampling_logp_difference/max": 1.1810429096221924, "sampling/sampling_logp_difference/mean": 0.013854834251105785, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 165.09375, "completions/mean_terminated_length": 165.09375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.17323510348796844, "epoch": 0.15318627450980393, "frac_reward_zero_std": 0.25, "grad_norm": 2.5482149381120744, "kl": 0.0043769716285169125, "learning_rate": 5.040650406504064e-07, "loss": 0.0973, "num_tokens": 3889521.0, "reward": 0.46875, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.612404704093933, "sampling/importance_sampling_ratio/mean": 0.9999923706054688, "sampling/importance_sampling_ratio/min": 0.4086287319660187, "sampling/sampling_logp_difference/max": 0.8949482440948486, "sampling/sampling_logp_difference/mean": 0.013625801540911198, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 217.21875, "completions/mean_terminated_length": 217.21875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.24082908034324646, "epoch": 0.15441176470588236, "frac_reward_zero_std": 0.25, "grad_norm": 2.0962555888788272, "kl": 0.006179064512252808, "learning_rate": 5.081300813008131e-07, "loss": 0.0102, "num_tokens": 3920687.0, "reward": 0.53125, "reward_std": 0.6970869898796082, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999029040336609, "sampling/importance_sampling_ratio/min": 0.4000912308692932, "sampling/sampling_logp_difference/max": 0.9160627126693726, "sampling/sampling_logp_difference/mean": 0.01595935970544815, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 148.53125, "completions/mean_terminated_length": 148.53125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.16705353558063507, "epoch": 0.1556372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 2.3979375953032402, "kl": 0.004339561332017183, "learning_rate": 5.121951219512195e-07, "loss": -0.0035, "num_tokens": 3945777.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6513288021087646, "sampling/importance_sampling_ratio/mean": 0.999443769454956, "sampling/importance_sampling_ratio/min": 0.4581097364425659, "sampling/sampling_logp_difference/max": 0.780646562576294, "sampling/sampling_logp_difference/mean": 0.014140298590064049, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 148.28125, "completions/mean_terminated_length": 148.28125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.20434808731079102, "epoch": 0.1568627450980392, "frac_reward_zero_std": 0.25, "grad_norm": 3.053304404963926, "kl": 0.006051310338079929, "learning_rate": 5.16260162601626e-07, "loss": 0.0431, "num_tokens": 3971763.0, "reward": 0.5, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995803833007812, "sampling/importance_sampling_ratio/min": 0.2824687957763672, "sampling/sampling_logp_difference/max": 1.2641870975494385, "sampling/sampling_logp_difference/mean": 0.015509507618844509, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 259.71875, "completions/mean_terminated_length": 259.71875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.17522120475769043, "epoch": 0.15808823529411764, "frac_reward_zero_std": 0.25, "grad_norm": 2.1036992710076245, "kl": 0.0062559181824326515, "learning_rate": 5.203252032520325e-07, "loss": 0.0781, "num_tokens": 4006929.0, "reward": 0.5625, "reward_std": 0.6663130521774292, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00052809715271, "sampling/importance_sampling_ratio/min": 0.20834197103977203, "sampling/sampling_logp_difference/max": 1.5685744285583496, "sampling/sampling_logp_difference/mean": 0.013994507491588593, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 207.625, "completions/mean_terminated_length": 207.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.22118355333805084, "epoch": 0.15931372549019607, "frac_reward_zero_std": 0.25, "grad_norm": 2.5144738581732677, "kl": 0.006246750243008137, "learning_rate": 5.24390243902439e-07, "loss": -0.0558, "num_tokens": 4036873.0, "reward": 0.375, "reward_std": 0.6267197132110596, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000226497650146, "sampling/importance_sampling_ratio/min": 0.3516245484352112, "sampling/sampling_logp_difference/max": 1.0451912879943848, "sampling/sampling_logp_difference/mean": 0.014911655336618423, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.13152629137039185, "epoch": 0.16053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.03046928461366396, "kl": 0.0037722119595855474, "learning_rate": 5.284552845528455e-07, "loss": 0.0, "num_tokens": 4065097.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000600814819336, "sampling/importance_sampling_ratio/min": 0.11419089883565903, "sampling/sampling_logp_difference/max": 2.1698837280273438, "sampling/sampling_logp_difference/mean": 0.011482913978397846, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.14755786955356598, "epoch": 0.16176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.03286335625323049, "kl": 0.004532915540039539, "learning_rate": 5.325203252032519e-07, "loss": 0.0, "num_tokens": 4094413.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997543096542358, "sampling/importance_sampling_ratio/min": 0.09504194557666779, "sampling/sampling_logp_difference/max": 2.3534369468688965, "sampling/sampling_logp_difference/mean": 0.01464562863111496, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 164.078125, "completions/mean_terminated_length": 164.078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1908288598060608, "epoch": 0.16299019607843138, "frac_reward_zero_std": 0.75, "grad_norm": 2.0801893591822433, "kl": 0.005190457217395306, "learning_rate": 5.365853658536586e-07, "loss": 0.0327, "num_tokens": 4124530.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999281167984009, "sampling/importance_sampling_ratio/min": 0.30315646529197693, "sampling/sampling_logp_difference/max": 1.3454761505126953, "sampling/sampling_logp_difference/mean": 0.018591245636343956, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 200.234375, "completions/mean_terminated_length": 200.234375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.20808929204940796, "epoch": 0.1642156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.2169722191860628, "kl": 0.009982358664274216, "learning_rate": 5.40650406504065e-07, "loss": -0.0116, "num_tokens": 4153425.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002822875976562, "sampling/importance_sampling_ratio/min": 0.5256777405738831, "sampling/sampling_logp_difference/max": 0.8757648468017578, "sampling/sampling_logp_difference/mean": 0.014309515245258808, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 214.40625, "completions/mean_terminated_length": 214.40625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2324899584054947, "epoch": 0.16544117647058823, "frac_reward_zero_std": 0.25, "grad_norm": 2.679416954022788, "kl": 0.012347174808382988, "learning_rate": 5.447154471544715e-07, "loss": -0.0228, "num_tokens": 4189371.0, "reward": 0.75, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000662803649902, "sampling/importance_sampling_ratio/min": 0.41598430275917053, "sampling/sampling_logp_difference/max": 0.8771077394485474, "sampling/sampling_logp_difference/mean": 0.01692269928753376, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 215.828125, "completions/mean_terminated_length": 215.828125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2709134817123413, "epoch": 0.16666666666666666, "frac_reward_zero_std": 0.25, "grad_norm": 2.2187073874644576, "kl": 0.007469979114830494, "learning_rate": 5.487804878048781e-07, "loss": -0.0014, "num_tokens": 4225440.0, "reward": 0.3125, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.7517609596252441, "sampling/importance_sampling_ratio/mean": 1.0001893043518066, "sampling/importance_sampling_ratio/min": 0.4335139989852905, "sampling/sampling_logp_difference/max": 0.8358311653137207, "sampling/sampling_logp_difference/mean": 0.018575577065348625, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 219.53125, "completions/mean_terminated_length": 219.53125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.18905353546142578, "epoch": 0.16789215686274508, "frac_reward_zero_std": 0.75, "grad_norm": 1.5848390274616682, "kl": 0.0062895650044083595, "learning_rate": 5.528455284552846e-07, "loss": 0.0301, "num_tokens": 4256034.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.7029181718826294, "sampling/importance_sampling_ratio/mean": 1.0001823902130127, "sampling/importance_sampling_ratio/min": 0.5002944469451904, "sampling/sampling_logp_difference/max": 0.6925585269927979, "sampling/sampling_logp_difference/mean": 0.012864059768617153, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21458560228347778, "epoch": 0.16911764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 1.8899681943241897, "kl": 0.007482388522475958, "learning_rate": 5.56910569105691e-07, "loss": 0.0397, "num_tokens": 4285738.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9538545608520508, "sampling/importance_sampling_ratio/mean": 1.0000500679016113, "sampling/importance_sampling_ratio/min": 0.2822778820991516, "sampling/sampling_logp_difference/max": 1.2648632526397705, "sampling/sampling_logp_difference/mean": 0.01581709086894989, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 211.796875, "completions/mean_terminated_length": 211.796875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2663291096687317, "epoch": 0.17034313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 1.811200263563688, "kl": 0.006997620686888695, "learning_rate": 5.609756097560975e-07, "loss": 0.0245, "num_tokens": 4316541.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.7874058485031128, "sampling/importance_sampling_ratio/mean": 1.0008964538574219, "sampling/importance_sampling_ratio/min": 0.49483078718185425, "sampling/sampling_logp_difference/max": 0.7035393714904785, "sampling/sampling_logp_difference/mean": 0.017890911549329758, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 205.03125, "completions/mean_terminated_length": 205.03125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.20182162523269653, "epoch": 0.1715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.034780777827098576, "kl": 0.004200125113129616, "learning_rate": 5.650406504065041e-07, "loss": 0.0, "num_tokens": 4349471.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000801682472229, "sampling/importance_sampling_ratio/min": 0.46579307317733765, "sampling/sampling_logp_difference/max": 0.8023383617401123, "sampling/sampling_logp_difference/mean": 0.015425757504999638, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 155.640625, "completions/mean_terminated_length": 155.640625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.15710008144378662, "epoch": 0.17279411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 2.2324288770488674, "kl": 0.009632674977183342, "learning_rate": 5.691056910569105e-07, "loss": 0.0051, "num_tokens": 4374408.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004931688308716, "sampling/importance_sampling_ratio/min": 0.37433290481567383, "sampling/sampling_logp_difference/max": 0.982609748840332, "sampling/sampling_logp_difference/mean": 0.012852296233177185, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 168.21875, "completions/mean_terminated_length": 168.21875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.18867620825767517, "epoch": 0.17401960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 1.3049256000371254, "kl": 0.006957865320146084, "learning_rate": 5.73170731707317e-07, "loss": 0.0155, "num_tokens": 4402886.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9916741847991943, "sampling/importance_sampling_ratio/mean": 1.0004972219467163, "sampling/importance_sampling_ratio/min": 0.36625033617019653, "sampling/sampling_logp_difference/max": 1.0044382810592651, "sampling/sampling_logp_difference/mean": 0.01673087105154991, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 158.5625, "completions/mean_terminated_length": 158.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.19338209927082062, "epoch": 0.17524509803921567, "frac_reward_zero_std": 0.5, "grad_norm": 2.986566813860805, "kl": 0.006531933322548866, "learning_rate": 5.772357723577236e-07, "loss": 0.0225, "num_tokens": 4427706.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997323751449585, "sampling/importance_sampling_ratio/min": 0.38433632254600525, "sampling/sampling_logp_difference/max": 0.9562373161315918, "sampling/sampling_logp_difference/mean": 0.01588144153356552, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 161.203125, "completions/mean_terminated_length": 161.203125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.27450457215309143, "epoch": 0.17647058823529413, "frac_reward_zero_std": 0.25, "grad_norm": 3.059225107853198, "kl": 0.014421003870666027, "learning_rate": 5.813008130081301e-07, "loss": 0.0124, "num_tokens": 4465031.0, "reward": 0.21875, "reward_std": 0.747555673122406, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994995594024658, "sampling/importance_sampling_ratio/min": 0.27133044600486755, "sampling/sampling_logp_difference/max": 1.3044178485870361, "sampling/sampling_logp_difference/mean": 0.02083595097064972, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 144.671875, "completions/mean_terminated_length": 144.671875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.18501253426074982, "epoch": 0.17769607843137256, "frac_reward_zero_std": 0.75, "grad_norm": 1.8337965737119062, "kl": 0.008115117438137531, "learning_rate": 5.853658536585365e-07, "loss": -0.0146, "num_tokens": 4491730.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.0005688667297363, "sampling/importance_sampling_ratio/min": 0.4150790870189667, "sampling/sampling_logp_difference/max": 0.8792862892150879, "sampling/sampling_logp_difference/mean": 0.015096020884811878, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 206.53125, "completions/mean_terminated_length": 206.53125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.19073179364204407, "epoch": 0.17892156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 2.3074569787892356, "kl": 0.009971046820282936, "learning_rate": 5.894308943089431e-07, "loss": 0.0561, "num_tokens": 4527908.0, "reward": 0.625, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000136137008667, "sampling/importance_sampling_ratio/min": 0.4788450598716736, "sampling/sampling_logp_difference/max": 0.807410478591919, "sampling/sampling_logp_difference/mean": 0.014115612953901291, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 208.4375, "completions/mean_terminated_length": 208.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.19065698981285095, "epoch": 0.1801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 2.1376428733901416, "kl": 0.003937162458896637, "learning_rate": 5.934959349593496e-07, "loss": -0.0116, "num_tokens": 4556816.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5901724100112915, "sampling/importance_sampling_ratio/mean": 0.9997545480728149, "sampling/importance_sampling_ratio/min": 0.08214754611253738, "sampling/sampling_logp_difference/max": 2.4992382526397705, "sampling/sampling_logp_difference/mean": 0.012619711458683014, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 207.234375, "completions/mean_terminated_length": 207.234375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.23524893820285797, "epoch": 0.18137254901960784, "frac_reward_zero_std": 0.25, "grad_norm": 3.2215328161770977, "kl": 0.00898287259042263, "learning_rate": 5.97560975609756e-07, "loss": 0.0794, "num_tokens": 4588735.0, "reward": 0.59375, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9876911640167236, "sampling/importance_sampling_ratio/mean": 1.0000414848327637, "sampling/importance_sampling_ratio/min": 0.3859356641769409, "sampling/sampling_logp_difference/max": 0.9520845413208008, "sampling/sampling_logp_difference/mean": 0.018174968659877777, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 146.453125, "completions/mean_terminated_length": 146.453125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.1839636117219925, "epoch": 0.18259803921568626, "frac_reward_zero_std": 0.5, "grad_norm": 2.870807509522527, "kl": 0.01568181812763214, "learning_rate": 6.016260162601626e-07, "loss": 0.0299, "num_tokens": 4617836.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007880926132202, "sampling/importance_sampling_ratio/min": 0.3016190528869629, "sampling/sampling_logp_difference/max": 1.1985905170440674, "sampling/sampling_logp_difference/mean": 0.01622135564684868, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2160109132528305, "epoch": 0.18382352941176472, "frac_reward_zero_std": 0.5, "grad_norm": 2.1448145492871644, "kl": 0.006461469456553459, "learning_rate": 6.056910569105691e-07, "loss": -0.007, "num_tokens": 4645160.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000572204589844, "sampling/importance_sampling_ratio/min": 0.23663082718849182, "sampling/sampling_logp_difference/max": 1.4412540197372437, "sampling/sampling_logp_difference/mean": 0.01504783146083355, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 192.171875, "completions/mean_terminated_length": 192.171875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.28349563479423523, "epoch": 0.18504901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.8813112982324198, "kl": 0.008609576150774956, "learning_rate": 6.097560975609756e-07, "loss": -0.0038, "num_tokens": 4679859.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7709717750549316, "sampling/importance_sampling_ratio/mean": 1.0000584125518799, "sampling/importance_sampling_ratio/min": 0.5407850742340088, "sampling/sampling_logp_difference/max": 0.6147333383560181, "sampling/sampling_logp_difference/mean": 0.01752694509923458, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 178.296875, "completions/mean_terminated_length": 178.296875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1929841935634613, "epoch": 0.18627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.285064208466724, "kl": 0.009654335677623749, "learning_rate": 6.13821138211382e-07, "loss": 0.0015, "num_tokens": 4711766.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000225305557251, "sampling/importance_sampling_ratio/min": 0.5676493048667908, "sampling/sampling_logp_difference/max": 0.7288963794708252, "sampling/sampling_logp_difference/mean": 0.014351408928632736, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 159.8125, "completions/mean_terminated_length": 159.8125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.18735501170158386, "epoch": 0.1875, "frac_reward_zero_std": 0.75, "grad_norm": 1.6921133600368123, "kl": 0.010966267436742783, "learning_rate": 6.178861788617887e-07, "loss": -0.0191, "num_tokens": 4738634.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.751664400100708, "sampling/importance_sampling_ratio/mean": 1.0007822513580322, "sampling/importance_sampling_ratio/min": 0.3664078414440155, "sampling/sampling_logp_difference/max": 1.0040082931518555, "sampling/sampling_logp_difference/mean": 0.014257569797337055, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 215.921875, "completions/mean_terminated_length": 215.921875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2175692319869995, "epoch": 0.18872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 2.0600521345041236, "kl": 0.008125945925712585, "learning_rate": 6.219512195121951e-07, "loss": -0.0697, "num_tokens": 4772085.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993202090263367, "sampling/importance_sampling_ratio/min": 0.23123972117900848, "sampling/sampling_logp_difference/max": 1.4643003940582275, "sampling/sampling_logp_difference/mean": 0.01601422019302845, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.21947787702083588, "epoch": 0.18995098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 2.5007272914259344, "kl": 0.014036942273378372, "learning_rate": 6.260162601626016e-07, "loss": 0.0502, "num_tokens": 4798853.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994269609451294, "sampling/importance_sampling_ratio/min": 0.48846733570098877, "sampling/sampling_logp_difference/max": 0.9443447589874268, "sampling/sampling_logp_difference/mean": 0.016575731337070465, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 164.03125, "completions/mean_terminated_length": 164.03125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.20363306999206543, "epoch": 0.19117647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.9863150545331905, "kl": 0.01705070771276951, "learning_rate": 6.300813008130081e-07, "loss": -0.0159, "num_tokens": 4826183.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000278949737549, "sampling/importance_sampling_ratio/min": 0.2822778820991516, "sampling/sampling_logp_difference/max": 1.2648632526397705, "sampling/sampling_logp_difference/mean": 0.014696292579174042, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 150.5625, "completions/mean_terminated_length": 150.5625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.16793668270111084, "epoch": 0.19240196078431374, "frac_reward_zero_std": 0.75, "grad_norm": 1.8347136815339293, "kl": 0.013624644838273525, "learning_rate": 6.341463414634146e-07, "loss": -0.001, "num_tokens": 4856155.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000171661376953, "sampling/importance_sampling_ratio/min": 0.3773249685764313, "sampling/sampling_logp_difference/max": 0.9746484756469727, "sampling/sampling_logp_difference/mean": 0.015161644667387009, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 164.640625, "completions/mean_terminated_length": 164.640625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.19261035323143005, "epoch": 0.19362745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.0138578712957336, "kl": 0.0075540426187217236, "learning_rate": 6.382113821138211e-07, "loss": 0.0021, "num_tokens": 4881396.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.9841989278793335, "sampling/importance_sampling_ratio/mean": 1.0002670288085938, "sampling/importance_sampling_ratio/min": 0.5383151769638062, "sampling/sampling_logp_difference/max": 0.6852152347564697, "sampling/sampling_logp_difference/mean": 0.014371512457728386, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 181.703125, "completions/mean_terminated_length": 181.703125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.19912806153297424, "epoch": 0.1948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 2.316865502848925, "kl": 0.006664983928203583, "learning_rate": 6.422764227642276e-07, "loss": 0.0183, "num_tokens": 4908721.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8389919996261597, "sampling/importance_sampling_ratio/mean": 0.9989259839057922, "sampling/importance_sampling_ratio/min": 0.21832527220249176, "sampling/sampling_logp_difference/max": 1.5217692852020264, "sampling/sampling_logp_difference/mean": 0.01603008806705475, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 176.8125, "completions/mean_terminated_length": 176.8125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.23629969358444214, "epoch": 0.19607843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.8826280925730605, "kl": 0.011435983702540398, "learning_rate": 6.463414634146342e-07, "loss": -0.0818, "num_tokens": 4939669.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8665122985839844, "sampling/importance_sampling_ratio/mean": 0.9996390342712402, "sampling/importance_sampling_ratio/min": 0.4956026077270508, "sampling/sampling_logp_difference/max": 0.7019808292388916, "sampling/sampling_logp_difference/mean": 0.016076810657978058, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 181.765625, "completions/mean_terminated_length": 181.765625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.1463281810283661, "epoch": 0.19730392156862744, "frac_reward_zero_std": 1.0, "grad_norm": 0.028215489525451012, "kl": 0.004451300948858261, "learning_rate": 6.504065040650406e-07, "loss": 0.0, "num_tokens": 4968406.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001604557037354, "sampling/importance_sampling_ratio/min": 0.07517139613628387, "sampling/sampling_logp_difference/max": 2.587984561920166, "sampling/sampling_logp_difference/mean": 0.01525675505399704, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 154.46875, "completions/mean_terminated_length": 154.46875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2354339212179184, "epoch": 0.19852941176470587, "frac_reward_zero_std": 0.5, "grad_norm": 2.407889101784422, "kl": 0.014467542991042137, "learning_rate": 6.544715447154471e-07, "loss": -0.0004, "num_tokens": 4997012.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5366731882095337, "sampling/importance_sampling_ratio/mean": 0.9997107982635498, "sampling/importance_sampling_ratio/min": 0.387630432844162, "sampling/sampling_logp_difference/max": 0.9477028846740723, "sampling/sampling_logp_difference/mean": 0.016185423359274864, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 207.015625, "completions/mean_terminated_length": 207.015625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1951788365840912, "epoch": 0.19975490196078433, "frac_reward_zero_std": 1.0, "grad_norm": 0.044195899524379294, "kl": 0.008431365713477135, "learning_rate": 6.585365853658536e-07, "loss": 0.0001, "num_tokens": 5027269.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8064712285995483, "sampling/importance_sampling_ratio/mean": 1.0001356601715088, "sampling/importance_sampling_ratio/min": 0.42160293459892273, "sampling/sampling_logp_difference/max": 0.8636913299560547, "sampling/sampling_logp_difference/mean": 0.014173239469528198, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 192.015625, "completions/mean_terminated_length": 192.015625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.1966334581375122, "epoch": 0.20098039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 1.5490323583576546, "kl": 0.011206312105059624, "learning_rate": 6.626016260162602e-07, "loss": -0.0075, "num_tokens": 5072278.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003595352172852, "sampling/importance_sampling_ratio/min": 0.14512112736701965, "sampling/sampling_logp_difference/max": 1.9301865100860596, "sampling/sampling_logp_difference/mean": 0.015656569972634315, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 162.21875, "completions/mean_terminated_length": 162.21875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2012788951396942, "epoch": 0.20220588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.9594882642866283, "kl": 0.009323729202151299, "learning_rate": 6.666666666666666e-07, "loss": 0.0417, "num_tokens": 5100372.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6112180948257446, "sampling/importance_sampling_ratio/mean": 0.9993187189102173, "sampling/importance_sampling_ratio/min": 0.49222439527511597, "sampling/sampling_logp_difference/max": 0.7088205814361572, "sampling/sampling_logp_difference/mean": 0.012670625001192093, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 127.234375, "completions/mean_terminated_length": 127.234375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.18998035788536072, "epoch": 0.2034313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 2.1797690247211605, "kl": 0.010904636234045029, "learning_rate": 6.707317073170731e-07, "loss": 0.0096, "num_tokens": 5123987.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8535860776901245, "sampling/importance_sampling_ratio/mean": 0.9999647736549377, "sampling/importance_sampling_ratio/min": 0.4310276508331299, "sampling/sampling_logp_difference/max": 0.8415830135345459, "sampling/sampling_logp_difference/mean": 0.015727341175079346, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 170.921875, "completions/mean_terminated_length": 170.921875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.21414750814437866, "epoch": 0.20465686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.03840811104929106, "kl": 0.007046514190733433, "learning_rate": 6.747967479674797e-07, "loss": 0.0001, "num_tokens": 5152254.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7908974885940552, "sampling/importance_sampling_ratio/mean": 0.9999610781669617, "sampling/importance_sampling_ratio/min": 0.546879768371582, "sampling/sampling_logp_difference/max": 0.60352623462677, "sampling/sampling_logp_difference/mean": 0.01575840264558792, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.24589388072490692, "epoch": 0.20588235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 2.3035277481025394, "kl": 0.009675178676843643, "learning_rate": 6.788617886178861e-07, "loss": -0.0053, "num_tokens": 5181406.0, "reward": 0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9536131620407104, "sampling/importance_sampling_ratio/mean": 0.9992693662643433, "sampling/importance_sampling_ratio/min": 0.26408031582832336, "sampling/sampling_logp_difference/max": 1.3315019607543945, "sampling/sampling_logp_difference/mean": 0.01715805009007454, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.24018113315105438, "epoch": 0.20710784313725492, "frac_reward_zero_std": 0.75, "grad_norm": 1.5100028986553524, "kl": 0.0078086634166538715, "learning_rate": 6.829268292682927e-07, "loss": 0.0068, "num_tokens": 5210938.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7270176410675049, "sampling/importance_sampling_ratio/mean": 1.000520944595337, "sampling/importance_sampling_ratio/min": 0.36654388904571533, "sampling/sampling_logp_difference/max": 1.0036370754241943, "sampling/sampling_logp_difference/mean": 0.01715228334069252, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.24454890191555023, "epoch": 0.20833333333333334, "frac_reward_zero_std": 0.5, "grad_norm": 2.099218301218686, "kl": 0.007833386771380901, "learning_rate": 6.869918699186991e-07, "loss": 0.0956, "num_tokens": 5244466.0, "reward": 0.46875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994741678237915, "sampling/importance_sampling_ratio/min": 0.3991053104400635, "sampling/sampling_logp_difference/max": 0.9185299873352051, "sampling/sampling_logp_difference/mean": 0.01967196725308895, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 142.53125, "completions/mean_terminated_length": 142.53125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.22392506897449493, "epoch": 0.20955882352941177, "frac_reward_zero_std": 0.75, "grad_norm": 3.2958306867135656, "kl": 0.008253801614046097, "learning_rate": 6.910569105691057e-07, "loss": 0.0075, "num_tokens": 5271508.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6007570028305054, "sampling/importance_sampling_ratio/mean": 1.000047206878662, "sampling/importance_sampling_ratio/min": 0.5157365202903748, "sampling/sampling_logp_difference/max": 0.6621593236923218, "sampling/sampling_logp_difference/mean": 0.015675466507673264, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 200.453125, "completions/mean_terminated_length": 200.453125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.24501550197601318, "epoch": 0.2107843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.9957968776660147, "kl": 0.00728217139840126, "learning_rate": 6.951219512195121e-07, "loss": 0.0107, "num_tokens": 5305713.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8588887453079224, "sampling/importance_sampling_ratio/mean": 1.000617265701294, "sampling/importance_sampling_ratio/min": 0.37776103615760803, "sampling/sampling_logp_difference/max": 0.9734934568405151, "sampling/sampling_logp_difference/mean": 0.016320761293172836, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 151.09375, "completions/mean_terminated_length": 151.09375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.16363389790058136, "epoch": 0.21200980392156862, "frac_reward_zero_std": 1.0, "grad_norm": 0.06250278516304968, "kl": 0.007543622981756926, "learning_rate": 6.991869918699187e-07, "loss": 0.0001, "num_tokens": 5333559.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001720190048218, "sampling/importance_sampling_ratio/min": 0.49989962577819824, "sampling/sampling_logp_difference/max": 0.7407248020172119, "sampling/sampling_logp_difference/mean": 0.012731247581541538, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 135.21875, "completions/mean_terminated_length": 135.21875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2280770242214203, "epoch": 0.21323529411764705, "frac_reward_zero_std": 0.25, "grad_norm": 3.4754516499699335, "kl": 0.01365518569946289, "learning_rate": 7.032520325203252e-07, "loss": 0.0067, "num_tokens": 5360981.0, "reward": 0.28125, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999151229858398, "sampling/importance_sampling_ratio/min": 0.41998329758644104, "sampling/sampling_logp_difference/max": 0.8675403594970703, "sampling/sampling_logp_difference/mean": 0.017936518415808678, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 192.390625, "completions/mean_terminated_length": 192.390625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.21126362681388855, "epoch": 0.21446078431372548, "frac_reward_zero_std": 0.25, "grad_norm": 2.480997335273884, "kl": 0.009101223200559616, "learning_rate": 7.073170731707316e-07, "loss": -0.0795, "num_tokens": 5394542.0, "reward": 0.3125, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9185150861740112, "sampling/importance_sampling_ratio/mean": 0.9991146326065063, "sampling/importance_sampling_ratio/min": 0.549663782119751, "sampling/sampling_logp_difference/max": 0.6515514850616455, "sampling/sampling_logp_difference/mean": 0.014745143242180347, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 146.90625, "completions/mean_terminated_length": 146.90625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2415069192647934, "epoch": 0.21568627450980393, "frac_reward_zero_std": 0.5, "grad_norm": 2.4862652865416726, "kl": 0.013575580902397633, "learning_rate": 7.113821138211382e-07, "loss": -0.099, "num_tokens": 5426520.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9987435340881348, "sampling/importance_sampling_ratio/mean": 0.9995778799057007, "sampling/importance_sampling_ratio/min": 0.38637208938598633, "sampling/sampling_logp_difference/max": 0.9509544372558594, "sampling/sampling_logp_difference/mean": 0.018081510439515114, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 162.015625, "completions/mean_terminated_length": 162.015625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.18460117280483246, "epoch": 0.21691176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 2.583307746648768, "kl": 0.00942918099462986, "learning_rate": 7.154471544715447e-07, "loss": 0.0146, "num_tokens": 5459801.0, "reward": -0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9393787384033203, "sampling/importance_sampling_ratio/mean": 1.0000228881835938, "sampling/importance_sampling_ratio/min": 0.3568578064441681, "sampling/sampling_logp_difference/max": 1.0304179191589355, "sampling/sampling_logp_difference/mean": 0.015044205822050571, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 163.578125, "completions/mean_terminated_length": 163.578125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.15562379360198975, "epoch": 0.2181372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 14.920599056075442, "kl": 0.007000989280641079, "learning_rate": 7.195121951219512e-07, "loss": -0.1402, "num_tokens": 5490590.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997345209121704, "sampling/importance_sampling_ratio/min": 0.37634479999542236, "sampling/sampling_logp_difference/max": 0.9772495627403259, "sampling/sampling_logp_difference/mean": 0.01211632415652275, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 143.28125, "completions/mean_terminated_length": 143.28125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.17127113044261932, "epoch": 0.2193627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 3.207770064939479, "kl": 0.014192151837050915, "learning_rate": 7.235772357723577e-07, "loss": -0.1414, "num_tokens": 5525120.0, "reward": -0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003180503845215, "sampling/importance_sampling_ratio/min": 0.3161073923110962, "sampling/sampling_logp_difference/max": 1.1516733169555664, "sampling/sampling_logp_difference/mean": 0.0200350321829319, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 145.21875, "completions/mean_terminated_length": 145.21875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.18500688672065735, "epoch": 0.22058823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 1.7760482190945632, "kl": 0.010376361198723316, "learning_rate": 7.276422764227642e-07, "loss": -0.0076, "num_tokens": 5554158.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9286831617355347, "sampling/importance_sampling_ratio/mean": 1.0003862380981445, "sampling/importance_sampling_ratio/min": 0.3744829595088959, "sampling/sampling_logp_difference/max": 0.9822089672088623, "sampling/sampling_logp_difference/mean": 0.01596665009856224, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 163.734375, "completions/mean_terminated_length": 163.734375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.16916601359844208, "epoch": 0.22181372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 2.1885559148855194, "kl": 0.014154801145195961, "learning_rate": 7.317073170731707e-07, "loss": 0.0202, "num_tokens": 5580589.0, "reward": 0.21875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.9891959428787231, "sampling/importance_sampling_ratio/mean": 0.9989780783653259, "sampling/importance_sampling_ratio/min": 0.47438451647758484, "sampling/sampling_logp_difference/max": 0.7457370758056641, "sampling/sampling_logp_difference/mean": 0.013993790373206139, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 135.75, "completions/mean_terminated_length": 135.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.17388153076171875, "epoch": 0.22303921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.9196768856258746, "kl": 0.009921906515955925, "learning_rate": 7.357723577235772e-07, "loss": 0.0211, "num_tokens": 5611837.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.752700686454773, "sampling/importance_sampling_ratio/mean": 0.9996520280838013, "sampling/importance_sampling_ratio/min": 0.5270344018936157, "sampling/sampling_logp_difference/max": 0.6404894590377808, "sampling/sampling_logp_difference/mean": 0.013768360950052738, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 195.03125, "completions/mean_terminated_length": 195.03125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.16102421283721924, "epoch": 0.22426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0247183771509286, "kl": 0.006647592410445213, "learning_rate": 7.398373983739837e-07, "loss": 0.0001, "num_tokens": 5644767.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.998921275138855, "sampling/importance_sampling_ratio/min": 0.24369849264621735, "sampling/sampling_logp_difference/max": 1.4118235111236572, "sampling/sampling_logp_difference/mean": 0.014862995594739914, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 136.203125, "completions/mean_terminated_length": 136.203125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.2119152545928955, "epoch": 0.22549019607843138, "frac_reward_zero_std": 0.5, "grad_norm": 2.434777201904055, "kl": 0.01427440531551838, "learning_rate": 7.439024390243903e-07, "loss": 0.0337, "num_tokens": 5668364.0, "reward": -0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006811618804932, "sampling/importance_sampling_ratio/min": 0.4294593632221222, "sampling/sampling_logp_difference/max": 0.9697303771972656, "sampling/sampling_logp_difference/mean": 0.01680961810052395, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 147.0625, "completions/mean_terminated_length": 147.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20346541702747345, "epoch": 0.2267156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.5941225362087368, "kl": 0.0125267980620265, "learning_rate": 7.479674796747967e-07, "loss": -0.0058, "num_tokens": 5695232.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.9195092916488647, "sampling/importance_sampling_ratio/mean": 1.0003814697265625, "sampling/importance_sampling_ratio/min": 0.4757547974586487, "sampling/sampling_logp_difference/max": 0.7428526878356934, "sampling/sampling_logp_difference/mean": 0.01674540340900421, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 162.796875, "completions/mean_terminated_length": 162.796875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.15275320410728455, "epoch": 0.22794117647058823, "frac_reward_zero_std": 0.5, "grad_norm": 7.866930979749354, "kl": 0.12918700277805328, "learning_rate": 7.520325203252032e-07, "loss": 0.0253, "num_tokens": 5726259.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997475147247314, "sampling/importance_sampling_ratio/min": 0.001375882071442902, "sampling/sampling_logp_difference/max": 6.58866024017334, "sampling/sampling_logp_difference/mean": 0.013679513707756996, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 217.859375, "completions/mean_terminated_length": 217.859375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.25289398431777954, "epoch": 0.22916666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 2.257704337020236, "kl": 0.008466934785246849, "learning_rate": 7.560975609756097e-07, "loss": 0.0071, "num_tokens": 5758778.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.8894721269607544, "sampling/importance_sampling_ratio/mean": 0.9993259906768799, "sampling/importance_sampling_ratio/min": 0.29954442381858826, "sampling/sampling_logp_difference/max": 1.2054924964904785, "sampling/sampling_logp_difference/mean": 0.017889931797981262, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 170.09375, "completions/mean_terminated_length": 170.09375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.23489050567150116, "epoch": 0.23039215686274508, "frac_reward_zero_std": 0.25, "grad_norm": 3.3651801574179805, "kl": 0.0169069841504097, "learning_rate": 7.601626016260162e-07, "loss": 0.0554, "num_tokens": 5796736.0, "reward": -0.0625, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996923208236694, "sampling/importance_sampling_ratio/min": 0.3697149455547333, "sampling/sampling_logp_difference/max": 1.2095823287963867, "sampling/sampling_logp_difference/mean": 0.018920443952083588, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 151.546875, "completions/mean_terminated_length": 151.546875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.19983212649822235, "epoch": 0.23161764705882354, "frac_reward_zero_std": 0.75, "grad_norm": 2.7491469653398797, "kl": 0.018551960587501526, "learning_rate": 7.642276422764228e-07, "loss": 0.0006, "num_tokens": 5822291.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9441649913787842, "sampling/importance_sampling_ratio/mean": 1.0001943111419678, "sampling/importance_sampling_ratio/min": 0.3973112106323242, "sampling/sampling_logp_difference/max": 0.9230353832244873, "sampling/sampling_logp_difference/mean": 0.015498116612434387, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 122.140625, "completions/mean_terminated_length": 122.140625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.17167788743972778, "epoch": 0.23284313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 2.0839651393069176, "kl": 0.015019101090729237, "learning_rate": 7.682926829268292e-07, "loss": -0.0065, "num_tokens": 5845404.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.9676556587219238, "sampling/importance_sampling_ratio/mean": 1.0010113716125488, "sampling/importance_sampling_ratio/min": 0.4645615816116333, "sampling/sampling_logp_difference/max": 0.7666611671447754, "sampling/sampling_logp_difference/mean": 0.014353152364492416, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 152.03125, "completions/mean_terminated_length": 152.03125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.21971988677978516, "epoch": 0.2340686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 3.086321636924277, "kl": 0.011947219260036945, "learning_rate": 7.723577235772358e-07, "loss": 0.0184, "num_tokens": 5877198.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994181394577026, "sampling/importance_sampling_ratio/min": 0.38398805260658264, "sampling/sampling_logp_difference/max": 1.4673542976379395, "sampling/sampling_logp_difference/mean": 0.017348386347293854, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 163.015625, "completions/mean_terminated_length": 163.015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.20136642456054688, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 1.7120380707996077, "kl": 0.008222850039601326, "learning_rate": 7.764227642276422e-07, "loss": 0.0271, "num_tokens": 5902815.0, "reward": -0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004281997680664, "sampling/importance_sampling_ratio/min": 0.3861261010169983, "sampling/sampling_logp_difference/max": 0.9515912532806396, "sampling/sampling_logp_difference/mean": 0.01523653045296669, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 272.65625, "completions/mean_terminated_length": 272.65625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.20873147249221802, "epoch": 0.23651960784313725, "frac_reward_zero_std": 0.5, "grad_norm": 1.466968744792855, "kl": 0.007534465286880732, "learning_rate": 7.804878048780488e-07, "loss": -0.0454, "num_tokens": 5945145.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992533922195435, "sampling/importance_sampling_ratio/min": 0.13629940152168274, "sampling/sampling_logp_difference/max": 1.99290132522583, "sampling/sampling_logp_difference/mean": 0.014972115866839886, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 270.40625, "completions/mean_terminated_length": 270.40625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.18233969807624817, "epoch": 0.23774509803921567, "frac_reward_zero_std": 0.75, "grad_norm": 0.9686086293141357, "kl": 0.008675255812704563, "learning_rate": 7.845528455284552e-07, "loss": 0.0009, "num_tokens": 5978467.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998888373374939, "sampling/importance_sampling_ratio/min": 0.28685885667800903, "sampling/sampling_logp_difference/max": 1.248764991760254, "sampling/sampling_logp_difference/mean": 0.013606593944132328, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 218.828125, "completions/mean_terminated_length": 218.828125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1996522843837738, "epoch": 0.23897058823529413, "frac_reward_zero_std": 0.5, "grad_norm": 1.6731080287121975, "kl": 0.014219471253454685, "learning_rate": 7.886178861788617e-07, "loss": -0.0101, "num_tokens": 6011416.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991976022720337, "sampling/importance_sampling_ratio/min": 0.33685335516929626, "sampling/sampling_logp_difference/max": 1.10807204246521, "sampling/sampling_logp_difference/mean": 0.01521037332713604, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 155.84375, "completions/mean_terminated_length": 155.84375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.13060662150382996, "epoch": 0.24019607843137256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0643870022708985, "kl": 0.008615046739578247, "learning_rate": 7.926829268292683e-07, "loss": 0.0001, "num_tokens": 6037774.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991104602813721, "sampling/importance_sampling_ratio/min": 0.2513620853424072, "sampling/sampling_logp_difference/max": 1.3808608055114746, "sampling/sampling_logp_difference/mean": 0.0147407790645957, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 161.890625, "completions/mean_terminated_length": 161.890625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.23081961274147034, "epoch": 0.24142156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 2.3276566035807305, "kl": 0.018241455778479576, "learning_rate": 7.967479674796747e-07, "loss": 0.0068, "num_tokens": 6067831.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001935958862305, "sampling/importance_sampling_ratio/min": 0.5692372918128967, "sampling/sampling_logp_difference/max": 0.9607486724853516, "sampling/sampling_logp_difference/mean": 0.016515467315912247, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 193.53125, "completions/mean_terminated_length": 193.53125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.20671209692955017, "epoch": 0.2426470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.9772604678970633, "kl": 0.012380264699459076, "learning_rate": 8.008130081300813e-07, "loss": 0.0614, "num_tokens": 6093689.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8159511089324951, "sampling/importance_sampling_ratio/mean": 1.0001795291900635, "sampling/importance_sampling_ratio/min": 0.5512356162071228, "sampling/sampling_logp_difference/max": 0.596609354019165, "sampling/sampling_logp_difference/mean": 0.014003811404109001, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 172.53125, "completions/mean_terminated_length": 172.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18476727604866028, "epoch": 0.24387254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.9137306340793396, "kl": 0.011842640116810799, "learning_rate": 8.048780487804878e-07, "loss": 0.0917, "num_tokens": 6123595.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7520594596862793, "sampling/importance_sampling_ratio/mean": 1.0005278587341309, "sampling/importance_sampling_ratio/min": 0.416149377822876, "sampling/sampling_logp_difference/max": 0.8767110109329224, "sampling/sampling_logp_difference/mean": 0.014176880940794945, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 224.5625, "completions/mean_terminated_length": 224.5625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.21131455898284912, "epoch": 0.24509803921568626, "frac_reward_zero_std": 0.75, "grad_norm": 1.1316313504215345, "kl": 0.01181216724216938, "learning_rate": 8.089430894308943e-07, "loss": 0.019, "num_tokens": 6163455.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9490059614181519, "sampling/importance_sampling_ratio/mean": 0.9999033212661743, "sampling/importance_sampling_ratio/min": 0.480782151222229, "sampling/sampling_logp_difference/max": 0.7323410511016846, "sampling/sampling_logp_difference/mean": 0.01573982834815979, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 165.390625, "completions/mean_terminated_length": 165.390625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.23284420371055603, "epoch": 0.24632352941176472, "frac_reward_zero_std": 0.5, "grad_norm": 2.203527965455497, "kl": 0.012116186320781708, "learning_rate": 8.130081300813008e-07, "loss": -0.0002, "num_tokens": 6191192.0, "reward": 0.15625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7609137296676636, "sampling/importance_sampling_ratio/mean": 1.0000020265579224, "sampling/importance_sampling_ratio/min": 0.44058364629745483, "sampling/sampling_logp_difference/max": 0.8196549415588379, "sampling/sampling_logp_difference/mean": 0.01848536543548107, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 176.109375, "completions/mean_terminated_length": 176.109375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1951962411403656, "epoch": 0.24754901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 2.2565005795965583, "kl": 0.015520875342190266, "learning_rate": 8.170731707317072e-07, "loss": -0.0195, "num_tokens": 6221775.0, "reward": 0.0625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992324113845825, "sampling/importance_sampling_ratio/min": 0.5110778212547302, "sampling/sampling_logp_difference/max": 0.8343586921691895, "sampling/sampling_logp_difference/mean": 0.015476308763027191, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.17135287821292877, "epoch": 0.24877450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 2.5963026554811197, "kl": 0.020184125751256943, "learning_rate": 8.211382113821138e-07, "loss": 0.0503, "num_tokens": 6245191.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6214877367019653, "sampling/importance_sampling_ratio/mean": 0.9992965459823608, "sampling/importance_sampling_ratio/min": 0.4405844509601593, "sampling/sampling_logp_difference/max": 0.8196531534194946, "sampling/sampling_logp_difference/mean": 0.01627563126385212, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 147.78125, "completions/mean_terminated_length": 147.78125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.17383095622062683, "epoch": 0.25, "frac_reward_zero_std": 0.75, "grad_norm": 1.3624706522297745, "kl": 0.015773724764585495, "learning_rate": 8.252032520325202e-07, "loss": -0.0004, "num_tokens": 6276105.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999467134475708, "sampling/importance_sampling_ratio/min": 0.4982483386993408, "sampling/sampling_logp_difference/max": 0.7709488868713379, "sampling/sampling_logp_difference/mean": 0.013784103095531464, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 140.171875, "completions/mean_terminated_length": 140.171875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.23402664065361023, "epoch": 0.2512254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.3533101667704015, "kl": 0.019832585006952286, "learning_rate": 8.292682926829268e-07, "loss": -0.0299, "num_tokens": 6300468.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006000995635986, "sampling/importance_sampling_ratio/min": 0.10227715969085693, "sampling/sampling_logp_difference/max": 2.280068874359131, "sampling/sampling_logp_difference/mean": 0.01633118838071823, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 165.015625, "completions/mean_terminated_length": 165.015625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.1390296220779419, "epoch": 0.25245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.05031020405828662, "kl": 0.011352060362696648, "learning_rate": 8.333333333333333e-07, "loss": 0.0001, "num_tokens": 6331157.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.615748405456543, "sampling/importance_sampling_ratio/mean": 1.000266432762146, "sampling/importance_sampling_ratio/min": 0.3780434727668762, "sampling/sampling_logp_difference/max": 0.9727461338043213, "sampling/sampling_logp_difference/mean": 0.010921423323452473, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 202.765625, "completions/mean_terminated_length": 202.765625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2006179839372635, "epoch": 0.2536764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.3644209320380678, "kl": 0.008472388610243797, "learning_rate": 8.373983739837398e-07, "loss": 0.042, "num_tokens": 6362310.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.710673451423645, "sampling/importance_sampling_ratio/mean": 0.9995939135551453, "sampling/importance_sampling_ratio/min": 0.490130215883255, "sampling/sampling_logp_difference/max": 0.7130842208862305, "sampling/sampling_logp_difference/mean": 0.015032893046736717, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 164.96875, "completions/mean_terminated_length": 164.96875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.22202682495117188, "epoch": 0.2549019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 2.177038224537697, "kl": 0.020616548135876656, "learning_rate": 8.414634146341463e-07, "loss": 0.0217, "num_tokens": 6387444.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997557401657104, "sampling/importance_sampling_ratio/min": 0.50432288646698, "sampling/sampling_logp_difference/max": 0.7088422775268555, "sampling/sampling_logp_difference/mean": 0.015844713896512985, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 141.640625, "completions/mean_terminated_length": 141.640625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2225850522518158, "epoch": 0.25612745098039214, "frac_reward_zero_std": 0.5, "grad_norm": 2.6814906793295235, "kl": 0.01864427700638771, "learning_rate": 8.455284552845529e-07, "loss": 0.0033, "num_tokens": 6411549.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8062304258346558, "sampling/importance_sampling_ratio/mean": 1.0005677938461304, "sampling/importance_sampling_ratio/min": 0.5038042068481445, "sampling/sampling_logp_difference/max": 0.6855676174163818, "sampling/sampling_logp_difference/mean": 0.017111271619796753, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 142.28125, "completions/mean_terminated_length": 142.28125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1608240008354187, "epoch": 0.25735294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 2.018945743347563, "kl": 0.013830848038196564, "learning_rate": 8.495934959349593e-07, "loss": 0.0104, "num_tokens": 6438687.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9990195631980896, "sampling/importance_sampling_ratio/min": 0.5116662383079529, "sampling/sampling_logp_difference/max": 0.7109653949737549, "sampling/sampling_logp_difference/mean": 0.014174254611134529, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 146.390625, "completions/mean_terminated_length": 146.390625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.20424455404281616, "epoch": 0.25857843137254904, "frac_reward_zero_std": 0.75, "grad_norm": 1.8196388901766183, "kl": 0.01573394425213337, "learning_rate": 8.536585365853657e-07, "loss": 0.0117, "num_tokens": 6466040.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9441547393798828, "sampling/importance_sampling_ratio/mean": 1.0002039670944214, "sampling/importance_sampling_ratio/min": 0.23401731252670288, "sampling/sampling_logp_difference/max": 1.4523601531982422, "sampling/sampling_logp_difference/mean": 0.01648673601448536, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 167.421875, "completions/mean_terminated_length": 167.421875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.17787525057792664, "epoch": 0.25980392156862747, "frac_reward_zero_std": 0.5, "grad_norm": 2.6141797340196433, "kl": 0.011871451511979103, "learning_rate": 8.577235772357723e-07, "loss": -0.0182, "num_tokens": 6491203.0, "reward": 0.4375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9012318849563599, "sampling/importance_sampling_ratio/mean": 0.9989027380943298, "sampling/importance_sampling_ratio/min": 0.41614916920661926, "sampling/sampling_logp_difference/max": 0.8767114877700806, "sampling/sampling_logp_difference/mean": 0.015307286754250526, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 197.71875, "completions/mean_terminated_length": 197.71875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.18765902519226074, "epoch": 0.2610294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.643178268300292, "kl": 0.015506663359701633, "learning_rate": 8.617886178861788e-07, "loss": 0.031, "num_tokens": 6525969.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998393058776855, "sampling/importance_sampling_ratio/min": 0.2954738736152649, "sampling/sampling_logp_difference/max": 1.219174861907959, "sampling/sampling_logp_difference/mean": 0.01343243382871151, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 173.984375, "completions/mean_terminated_length": 173.984375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2047746777534485, "epoch": 0.2622549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 2.5217419801274326, "kl": 0.022509688511490822, "learning_rate": 8.658536585365853e-07, "loss": 0.0669, "num_tokens": 6561760.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999178647994995, "sampling/importance_sampling_ratio/min": 0.5406491160392761, "sampling/sampling_logp_difference/max": 0.8521547317504883, "sampling/sampling_logp_difference/mean": 0.01589590311050415, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 161.78125, "completions/mean_terminated_length": 161.78125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.19016176462173462, "epoch": 0.26348039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 1.9519911585691987, "kl": 0.021345844492316246, "learning_rate": 8.699186991869918e-07, "loss": 0.0248, "num_tokens": 6599810.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6213493347167969, "sampling/importance_sampling_ratio/mean": 0.9995770454406738, "sampling/importance_sampling_ratio/min": 0.444172203540802, "sampling/sampling_logp_difference/max": 0.8115429878234863, "sampling/sampling_logp_difference/mean": 0.016685977578163147, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 210.53125, "completions/mean_terminated_length": 210.53125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.16471557319164276, "epoch": 0.2647058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.8635683806955845, "kl": 0.01851458102464676, "learning_rate": 8.739837398373984e-07, "loss": -0.1021, "num_tokens": 6631892.0, "reward": 0.1875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002044439315796, "sampling/importance_sampling_ratio/min": 0.38777610659599304, "sampling/sampling_logp_difference/max": 0.9473271369934082, "sampling/sampling_logp_difference/mean": 0.013975502923130989, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 185.1875, "completions/mean_terminated_length": 185.1875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.16010968387126923, "epoch": 0.2659313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.6897227310673995, "kl": 0.01930549368262291, "learning_rate": 8.780487804878048e-07, "loss": -0.0573, "num_tokens": 6662816.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.821642518043518, "sampling/importance_sampling_ratio/mean": 1.0006334781646729, "sampling/importance_sampling_ratio/min": 0.4810205101966858, "sampling/sampling_logp_difference/max": 0.7318453788757324, "sampling/sampling_logp_difference/mean": 0.012054579332470894, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 141.34375, "completions/mean_terminated_length": 141.34375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.17018850147724152, "epoch": 0.26715686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 2.0365420752399577, "kl": 0.0233754962682724, "learning_rate": 8.821138211382113e-07, "loss": -0.0094, "num_tokens": 6693078.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001654624938965, "sampling/importance_sampling_ratio/min": 0.40650513768196106, "sampling/sampling_logp_difference/max": 0.9001587629318237, "sampling/sampling_logp_difference/mean": 0.013843964785337448, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 167.828125, "completions/mean_terminated_length": 167.828125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.18336881697177887, "epoch": 0.26838235294117646, "frac_reward_zero_std": 0.75, "grad_norm": 1.934212915296, "kl": 0.01532343216240406, "learning_rate": 8.861788617886179e-07, "loss": 0.0647, "num_tokens": 6720331.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003345012664795, "sampling/importance_sampling_ratio/min": 0.4310505986213684, "sampling/sampling_logp_difference/max": 1.2630889415740967, "sampling/sampling_logp_difference/mean": 0.014666068367660046, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 145.59375, "completions/mean_terminated_length": 145.59375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.19705787301063538, "epoch": 0.2696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.9169989542169759, "kl": 0.02064172551035881, "learning_rate": 8.902439024390244e-07, "loss": 0.0325, "num_tokens": 6747377.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996950030326843, "sampling/importance_sampling_ratio/min": 0.4441492259502411, "sampling/sampling_logp_difference/max": 0.8115947246551514, "sampling/sampling_logp_difference/mean": 0.014506572857499123, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 197.296875, "completions/mean_terminated_length": 197.296875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.26085400581359863, "epoch": 0.2708333333333333, "frac_reward_zero_std": 0.25, "grad_norm": 2.222035021598264, "kl": 0.03441544622182846, "learning_rate": 8.943089430894308e-07, "loss": 0.0238, "num_tokens": 6775668.0, "reward": -0.40625, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9800100326538086, "sampling/importance_sampling_ratio/mean": 1.0002212524414062, "sampling/importance_sampling_ratio/min": 0.35668912529945374, "sampling/sampling_logp_difference/max": 1.030890703201294, "sampling/sampling_logp_difference/mean": 0.01797478087246418, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 116.515625, "completions/mean_terminated_length": 116.515625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.15425002574920654, "epoch": 0.27205882352941174, "frac_reward_zero_std": 0.75, "grad_norm": 1.856688339430402, "kl": 0.03441350907087326, "learning_rate": 8.983739837398373e-07, "loss": -0.0177, "num_tokens": 6802933.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6629855632781982, "sampling/importance_sampling_ratio/mean": 1.0002639293670654, "sampling/importance_sampling_ratio/min": 0.3411581218242645, "sampling/sampling_logp_difference/max": 1.075409173965454, "sampling/sampling_logp_difference/mean": 0.013100311160087585, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21026542782783508, "epoch": 0.27328431372549017, "frac_reward_zero_std": 0.75, "grad_norm": 1.2633458308151913, "kl": 0.02543017454445362, "learning_rate": 9.024390243902439e-07, "loss": -0.0153, "num_tokens": 6833341.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999900221824646, "sampling/importance_sampling_ratio/min": 0.3941422700881958, "sampling/sampling_logp_difference/max": 0.9310433864593506, "sampling/sampling_logp_difference/mean": 0.016453826799988747, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 144.03125, "completions/mean_terminated_length": 144.03125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.19323155283927917, "epoch": 0.27450980392156865, "frac_reward_zero_std": 0.5, "grad_norm": 2.238740050855254, "kl": 0.041469473391771317, "learning_rate": 9.065040650406503e-07, "loss": 0.0099, "num_tokens": 6861551.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000789165496826, "sampling/importance_sampling_ratio/min": 0.3232390582561493, "sampling/sampling_logp_difference/max": 1.1293630599975586, "sampling/sampling_logp_difference/mean": 0.015687420964241028, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 164.359375, "completions/mean_terminated_length": 164.359375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.14599479734897614, "epoch": 0.2757352941176471, "frac_reward_zero_std": 0.75, "grad_norm": 1.8989134891792927, "kl": 0.018956230953335762, "learning_rate": 9.105691056910569e-07, "loss": -0.0124, "num_tokens": 6887334.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.817976713180542, "sampling/importance_sampling_ratio/mean": 0.9996324777603149, "sampling/importance_sampling_ratio/min": 0.376531183719635, "sampling/sampling_logp_difference/max": 0.9767544269561768, "sampling/sampling_logp_difference/mean": 0.012233897112309933, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 136.34375, "completions/mean_terminated_length": 136.34375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2081795334815979, "epoch": 0.2769607843137255, "frac_reward_zero_std": 0.25, "grad_norm": 3.678218026388671, "kl": 0.03165192902088165, "learning_rate": 9.146341463414634e-07, "loss": 0.0147, "num_tokens": 6912236.0, "reward": 0.125, "reward_std": 0.5651718378067017, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6185600757598877, "sampling/importance_sampling_ratio/mean": 1.0005383491516113, "sampling/importance_sampling_ratio/min": 0.6078590154647827, "sampling/sampling_logp_difference/max": 0.49781227111816406, "sampling/sampling_logp_difference/mean": 0.015303589403629303, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 139.28125, "completions/mean_terminated_length": 139.28125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.1498456597328186, "epoch": 0.27818627450980393, "frac_reward_zero_std": 0.25, "grad_norm": 3.3959461708268885, "kl": 0.05385677143931389, "learning_rate": 9.186991869918699e-07, "loss": 0.0689, "num_tokens": 6938574.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.527761459350586, "sampling/importance_sampling_ratio/mean": 0.9992302060127258, "sampling/importance_sampling_ratio/min": 0.0881451815366745, "sampling/sampling_logp_difference/max": 2.428770065307617, "sampling/sampling_logp_difference/mean": 0.015360962599515915, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.16509565711021423, "epoch": 0.27941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.08382376646922701, "kl": 0.018290940672159195, "learning_rate": 9.227642276422763e-07, "loss": 0.0002, "num_tokens": 6971622.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.769323706626892, "sampling/importance_sampling_ratio/mean": 0.9997942447662354, "sampling/importance_sampling_ratio/min": 0.5412097573280334, "sampling/sampling_logp_difference/max": 0.6139483451843262, "sampling/sampling_logp_difference/mean": 0.013449668884277344, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 152.71875, "completions/mean_terminated_length": 152.71875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.2042853981256485, "epoch": 0.2806372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 2.8192731831295728, "kl": 0.021124407649040222, "learning_rate": 9.26829268292683e-07, "loss": -0.0314, "num_tokens": 7001028.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000542402267456, "sampling/importance_sampling_ratio/min": 0.15975400805473328, "sampling/sampling_logp_difference/max": 2.1725785732269287, "sampling/sampling_logp_difference/mean": 0.017363186925649643, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.19672785699367523, "epoch": 0.2818627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 2.229775984302137, "kl": 0.014601796865463257, "learning_rate": 9.308943089430894e-07, "loss": -0.0424, "num_tokens": 7041636.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995140433311462, "sampling/importance_sampling_ratio/min": 0.398908406496048, "sampling/sampling_logp_difference/max": 0.9190235137939453, "sampling/sampling_logp_difference/mean": 0.016202237457036972, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.12926724553108215, "epoch": 0.28308823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.5619613064277984, "kl": 0.02092336118221283, "learning_rate": 9.349593495934958e-07, "loss": 0.0194, "num_tokens": 7070924.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8715226650238037, "sampling/importance_sampling_ratio/mean": 1.000416874885559, "sampling/importance_sampling_ratio/min": 0.38201507925987244, "sampling/sampling_logp_difference/max": 0.9622951745986938, "sampling/sampling_logp_difference/mean": 0.010455417446792126, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 131.3125, "completions/mean_terminated_length": 131.3125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.23128622770309448, "epoch": 0.28431372549019607, "frac_reward_zero_std": 0.25, "grad_norm": 2.84886119094527, "kl": 0.019218452274799347, "learning_rate": 9.390243902439024e-07, "loss": -0.0039, "num_tokens": 7099472.0, "reward": 0.25, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001335144042969, "sampling/importance_sampling_ratio/min": 0.488338440656662, "sampling/sampling_logp_difference/max": 0.7325534820556641, "sampling/sampling_logp_difference/mean": 0.01647172123193741, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.1792403757572174, "epoch": 0.2855392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.8270636713308035, "kl": 0.013000808656215668, "learning_rate": 9.430894308943089e-07, "loss": -0.0049, "num_tokens": 7123976.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997499585151672, "sampling/importance_sampling_ratio/min": 0.49598950147628784, "sampling/sampling_logp_difference/max": 0.7306704521179199, "sampling/sampling_logp_difference/mean": 0.013863112777471542, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 196.9375, "completions/mean_terminated_length": 196.9375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21675468981266022, "epoch": 0.2867647058823529, "frac_reward_zero_std": 0.25, "grad_norm": 2.4134308604205, "kl": 0.022405382245779037, "learning_rate": 9.471544715447154e-07, "loss": 0.0002, "num_tokens": 7158916.0, "reward": 0.625, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.602049708366394, "sampling/importance_sampling_ratio/mean": 0.9996261596679688, "sampling/importance_sampling_ratio/min": 0.4933438301086426, "sampling/sampling_logp_difference/max": 0.7065489292144775, "sampling/sampling_logp_difference/mean": 0.016713209450244904, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 148.296875, "completions/mean_terminated_length": 148.296875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.1597084403038025, "epoch": 0.28799019607843135, "frac_reward_zero_std": 0.5, "grad_norm": 1.7514324834207053, "kl": 0.016031622886657715, "learning_rate": 9.512195121951218e-07, "loss": -0.0055, "num_tokens": 7187783.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997620582580566, "sampling/importance_sampling_ratio/min": 0.4871583580970764, "sampling/sampling_logp_difference/max": 0.7891626358032227, "sampling/sampling_logp_difference/mean": 0.012351596727967262, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 137.40625, "completions/mean_terminated_length": 137.40625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2350776195526123, "epoch": 0.28921568627450983, "frac_reward_zero_std": 0.25, "grad_norm": 3.2546834242240266, "kl": 0.023296181112527847, "learning_rate": 9.552845528455285e-07, "loss": -0.0099, "num_tokens": 7211105.0, "reward": 0.15625, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7656652927398682, "sampling/importance_sampling_ratio/mean": 1.000077724456787, "sampling/importance_sampling_ratio/min": 0.3414660096168518, "sampling/sampling_logp_difference/max": 1.0745071172714233, "sampling/sampling_logp_difference/mean": 0.016961853951215744, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 212.578125, "completions/mean_terminated_length": 212.578125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.22627148032188416, "epoch": 0.29044117647058826, "frac_reward_zero_std": 0.75, "grad_norm": 1.2184433906931282, "kl": 0.013846682384610176, "learning_rate": 9.59349593495935e-07, "loss": -0.0401, "num_tokens": 7259302.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.998911440372467, "sampling/importance_sampling_ratio/min": 0.47263118624687195, "sampling/sampling_logp_difference/max": 0.8971564769744873, "sampling/sampling_logp_difference/mean": 0.01815243996679783, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 195.453125, "completions/mean_terminated_length": 195.453125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.20510563254356384, "epoch": 0.2916666666666667, "frac_reward_zero_std": 0.25, "grad_norm": 2.5715892032303334, "kl": 0.02353559248149395, "learning_rate": 9.634146341463414e-07, "loss": 0.0412, "num_tokens": 7286035.0, "reward": 0.0625, "reward_std": 0.5738953948020935, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.990490198135376, "sampling/importance_sampling_ratio/mean": 1.000231146812439, "sampling/importance_sampling_ratio/min": 0.3369840681552887, "sampling/sampling_logp_difference/max": 1.0877196788787842, "sampling/sampling_logp_difference/mean": 0.01640462502837181, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 190.953125, "completions/mean_terminated_length": 190.953125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.16903972625732422, "epoch": 0.2928921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 3.351155146835451, "kl": 0.013523912988603115, "learning_rate": 9.67479674796748e-07, "loss": 0.0591, "num_tokens": 7318320.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995356798171997, "sampling/importance_sampling_ratio/min": 0.1982087939977646, "sampling/sampling_logp_difference/max": 1.6184343099594116, "sampling/sampling_logp_difference/mean": 0.014733832329511642, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.203177809715271, "epoch": 0.29411764705882354, "frac_reward_zero_std": 0.25, "grad_norm": 2.4134120577411617, "kl": 0.009299861267209053, "learning_rate": 9.715447154471544e-07, "loss": -0.0012, "num_tokens": 7350944.0, "reward": -0.03125, "reward_std": 0.5431214570999146, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002261400222778, "sampling/importance_sampling_ratio/min": 0.3533068597316742, "sampling/sampling_logp_difference/max": 1.040418267250061, "sampling/sampling_logp_difference/mean": 0.014737317338585854, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 172.859375, "completions/mean_terminated_length": 172.859375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2432369738817215, "epoch": 0.29534313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 1.8605392135504952, "kl": 0.01595490612089634, "learning_rate": 9.756097560975609e-07, "loss": -0.0717, "num_tokens": 7378039.0, "reward": -0.09375, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005640983581543, "sampling/importance_sampling_ratio/min": 0.6027071475982666, "sampling/sampling_logp_difference/max": 0.784705638885498, "sampling/sampling_logp_difference/mean": 0.015564032830297947, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.22795718908309937, "epoch": 0.2965686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 2.4576001673744328, "kl": 0.01724562793970108, "learning_rate": 9.796747967479673e-07, "loss": 0.0406, "num_tokens": 7414931.0, "reward": 0.625, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7553232908248901, "sampling/importance_sampling_ratio/mean": 0.9996345043182373, "sampling/importance_sampling_ratio/min": 0.43154290318489075, "sampling/sampling_logp_difference/max": 0.8403884172439575, "sampling/sampling_logp_difference/mean": 0.017535878345370293, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 183.546875, "completions/mean_terminated_length": 183.546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.22533947229385376, "epoch": 0.2977941176470588, "frac_reward_zero_std": 0.25, "grad_norm": 3.1914303458815296, "kl": 0.017026592046022415, "learning_rate": 9.83739837398374e-07, "loss": 0.1802, "num_tokens": 7442566.0, "reward": 0.375, "reward_std": 0.6267197132110596, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997953772544861, "sampling/importance_sampling_ratio/min": 0.08131000399589539, "sampling/sampling_logp_difference/max": 2.509486198425293, "sampling/sampling_logp_difference/mean": 0.01854054443538189, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 166.78125, "completions/mean_terminated_length": 166.78125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.19965288043022156, "epoch": 0.29901960784313725, "frac_reward_zero_std": 0.5, "grad_norm": 2.9273974052382448, "kl": 0.0182357020676136, "learning_rate": 9.878048780487804e-07, "loss": 0.0259, "num_tokens": 7470472.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6897634267807007, "sampling/importance_sampling_ratio/mean": 1.0001301765441895, "sampling/importance_sampling_ratio/min": 0.5680594444274902, "sampling/sampling_logp_difference/max": 0.5655292272567749, "sampling/sampling_logp_difference/mean": 0.014180110767483711, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 222.296875, "completions/mean_terminated_length": 222.296875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.21197420358657837, "epoch": 0.3002450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.9902050280315178, "kl": 0.016592450439929962, "learning_rate": 9.918699186991869e-07, "loss": 0.0053, "num_tokens": 7510923.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000331163406372, "sampling/importance_sampling_ratio/min": 0.3722730576992035, "sampling/sampling_logp_difference/max": 0.9881277084350586, "sampling/sampling_logp_difference/mean": 0.016205525025725365, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 190.546875, "completions/mean_terminated_length": 190.546875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.28110504150390625, "epoch": 0.3014705882352941, "frac_reward_zero_std": 0.25, "grad_norm": 2.9143400473740906, "kl": 0.01969430036842823, "learning_rate": 9.959349593495935e-07, "loss": 0.0256, "num_tokens": 7544686.0, "reward": -0.03125, "reward_std": 0.6683381795883179, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.797054648399353, "sampling/importance_sampling_ratio/mean": 0.9998266696929932, "sampling/importance_sampling_ratio/min": 0.3880440294742584, "sampling/sampling_logp_difference/max": 0.946636438369751, "sampling/sampling_logp_difference/mean": 0.01925182156264782, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.10917695611715317, "epoch": 0.30269607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.04571589721542751, "kl": 0.013499004766345024, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 7572382.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000195503234863, "sampling/importance_sampling_ratio/min": 0.20237791538238525, "sampling/sampling_logp_difference/max": 1.5976184606552124, "sampling/sampling_logp_difference/mean": 0.008364669978618622, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 163.328125, "completions/mean_terminated_length": 163.328125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.22276531159877777, "epoch": 0.30392156862745096, "frac_reward_zero_std": 0.5, "grad_norm": 2.2225595181548026, "kl": 0.020731121301651, "learning_rate": 9.99999492515838e-07, "loss": -0.0246, "num_tokens": 7600355.0, "reward": -0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992364048957825, "sampling/importance_sampling_ratio/min": 0.5002468228340149, "sampling/sampling_logp_difference/max": 0.7977039813995361, "sampling/sampling_logp_difference/mean": 0.018636515364050865, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 228.640625, "completions/mean_terminated_length": 228.640625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2560403347015381, "epoch": 0.30514705882352944, "frac_reward_zero_std": 0.0, "grad_norm": 2.388117291352842, "kl": 0.01740189641714096, "learning_rate": 9.99997970064382e-07, "loss": -0.0331, "num_tokens": 7635852.0, "reward": 0.46875, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9314196109771729, "sampling/importance_sampling_ratio/mean": 1.00026273727417, "sampling/importance_sampling_ratio/min": 0.48761385679244995, "sampling/sampling_logp_difference/max": 0.7182314395904541, "sampling/sampling_logp_difference/mean": 0.017072558403015137, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 178.921875, "completions/mean_terminated_length": 178.921875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2835567593574524, "epoch": 0.30637254901960786, "frac_reward_zero_std": 0.25, "grad_norm": 3.0005930201165896, "kl": 0.02886389009654522, "learning_rate": 9.999954326487227e-07, "loss": 0.0229, "num_tokens": 7662375.0, "reward": 0.4375, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8641325235366821, "sampling/importance_sampling_ratio/mean": 0.9994344711303711, "sampling/importance_sampling_ratio/min": 0.5916401147842407, "sampling/sampling_logp_difference/max": 0.622795820236206, "sampling/sampling_logp_difference/mean": 0.017332665622234344, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 177.078125, "completions/mean_terminated_length": 177.078125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.19621199369430542, "epoch": 0.3075980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 2.495969881060601, "kl": 0.021376606076955795, "learning_rate": 9.999918802740106e-07, "loss": 0.0438, "num_tokens": 7686092.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4804352521896362, "sampling/importance_sampling_ratio/mean": 1.0000429153442383, "sampling/importance_sampling_ratio/min": 0.5440237522125244, "sampling/sampling_logp_difference/max": 0.6087623834609985, "sampling/sampling_logp_difference/mean": 0.013395338319242, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 198.3125, "completions/mean_terminated_length": 198.3125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.14947807788848877, "epoch": 0.3088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.4286947189398673, "kl": 0.016668811440467834, "learning_rate": 9.999873129474573e-07, "loss": 0.0134, "num_tokens": 7718192.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999808669090271, "sampling/importance_sampling_ratio/min": 0.4353545904159546, "sampling/sampling_logp_difference/max": 0.8583614826202393, "sampling/sampling_logp_difference/mean": 0.012718813493847847, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 233.1875, "completions/mean_terminated_length": 233.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2590774595737457, "epoch": 0.31004901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 2.371577105170613, "kl": 0.018302420154213905, "learning_rate": 9.999817306783336e-07, "loss": 0.0031, "num_tokens": 7748204.0, "reward": 0.5, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6542383432388306, "sampling/importance_sampling_ratio/mean": 0.9993072748184204, "sampling/importance_sampling_ratio/min": 0.4976939857006073, "sampling/sampling_logp_difference/max": 0.6977698802947998, "sampling/sampling_logp_difference/mean": 0.015687517821788788, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 153.859375, "completions/mean_terminated_length": 153.859375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.14650636911392212, "epoch": 0.3112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.03853809124553455, "kl": 0.016221893951296806, "learning_rate": 9.999751334779714e-07, "loss": 0.0002, "num_tokens": 7772627.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000417947769165, "sampling/importance_sampling_ratio/min": 0.4323920011520386, "sampling/sampling_logp_difference/max": 1.1542718410491943, "sampling/sampling_logp_difference/mean": 0.010418561287224293, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.22642837464809418, "epoch": 0.3125, "frac_reward_zero_std": 0.75, "grad_norm": 1.4917039896100186, "kl": 0.03227543085813522, "learning_rate": 9.999675213597626e-07, "loss": 0.0128, "num_tokens": 7801695.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4883091449737549, "sampling/importance_sampling_ratio/mean": 1.0000696182250977, "sampling/importance_sampling_ratio/min": 0.3592665493488312, "sampling/sampling_logp_difference/max": 1.0236907005310059, "sampling/sampling_logp_difference/mean": 0.016973216086626053, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 180.265625, "completions/mean_terminated_length": 180.265625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.18876762688159943, "epoch": 0.3137254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.3133944861645632, "kl": 0.02303905412554741, "learning_rate": 9.999588943391595e-07, "loss": -0.0206, "num_tokens": 7831184.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992313385009766, "sampling/importance_sampling_ratio/min": 0.4441002309322357, "sampling/sampling_logp_difference/max": 0.9530837535858154, "sampling/sampling_logp_difference/mean": 0.014563923701643944, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 202.59375, "completions/mean_terminated_length": 202.59375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.22054433822631836, "epoch": 0.31495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.3198213503704428, "kl": 0.03424456715583801, "learning_rate": 9.999492524336742e-07, "loss": -0.0358, "num_tokens": 7858454.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6623640060424805, "sampling/importance_sampling_ratio/mean": 0.9998559355735779, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.6008007526397705, "sampling/sampling_logp_difference/mean": 0.013884920626878738, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.23330222070217133, "epoch": 0.3161764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.469206670551493, "kl": 0.04369250684976578, "learning_rate": 9.999385956628792e-07, "loss": 0.0103, "num_tokens": 7885478.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007914304733276, "sampling/importance_sampling_ratio/min": 0.4159523844718933, "sampling/sampling_logp_difference/max": 0.8771844506263733, "sampling/sampling_logp_difference/mean": 0.016755102202296257, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 192.078125, "completions/mean_terminated_length": 192.078125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.1585007905960083, "epoch": 0.3174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.6558902324721223, "kl": 0.045375652611255646, "learning_rate": 9.999269240484069e-07, "loss": -0.0616, "num_tokens": 7915627.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9510289430618286, "sampling/importance_sampling_ratio/mean": 1.0000808238983154, "sampling/importance_sampling_ratio/min": 0.3604038953781128, "sampling/sampling_logp_difference/max": 1.0205299854278564, "sampling/sampling_logp_difference/mean": 0.0157431922852993, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 212.59375, "completions/mean_terminated_length": 212.59375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.20303696393966675, "epoch": 0.31862745098039214, "frac_reward_zero_std": 0.75, "grad_norm": 1.4575941251694466, "kl": 0.029700467362999916, "learning_rate": 9.999142376139503e-07, "loss": 0.0139, "num_tokens": 7949601.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002882480621338, "sampling/importance_sampling_ratio/min": 0.4251917004585266, "sampling/sampling_logp_difference/max": 0.8552151918411255, "sampling/sampling_logp_difference/mean": 0.014961793087422848, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 142.90625, "completions/mean_terminated_length": 142.90625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.15443193912506104, "epoch": 0.31985294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.07583212926367194, "kl": 0.04153234511613846, "learning_rate": 9.999005363852617e-07, "loss": 0.0004, "num_tokens": 7974603.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000119686126709, "sampling/importance_sampling_ratio/min": 0.4441169500350952, "sampling/sampling_logp_difference/max": 0.8772122859954834, "sampling/sampling_logp_difference/mean": 0.012940686196088791, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.1729099452495575, "epoch": 0.32107843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.053998073517434415, "kl": 0.02735218033194542, "learning_rate": 9.99885820390154e-07, "loss": 0.0003, "num_tokens": 8008011.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006272792816162, "sampling/importance_sampling_ratio/min": 0.16875389218330383, "sampling/sampling_logp_difference/max": 1.7793139219284058, "sampling/sampling_logp_difference/mean": 0.013979100622236729, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 240.171875, "completions/mean_terminated_length": 240.171875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2443355768918991, "epoch": 0.32230392156862747, "frac_reward_zero_std": 0.75, "grad_norm": 1.43946002353813, "kl": 0.026139382272958755, "learning_rate": 9.998700896584995e-07, "loss": 0.0006, "num_tokens": 8043670.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8476462364196777, "sampling/importance_sampling_ratio/mean": 1.000669002532959, "sampling/importance_sampling_ratio/min": 0.4119199812412262, "sampling/sampling_logp_difference/max": 0.8869261741638184, "sampling/sampling_logp_difference/mean": 0.01667368784546852, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 290.015625, "completions/mean_terminated_length": 290.015625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23038527369499207, "epoch": 0.3235294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 3.2449243314746616, "kl": 0.054875560104846954, "learning_rate": 9.998533442222308e-07, "loss": 0.0311, "num_tokens": 8081463.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998653531074524, "sampling/importance_sampling_ratio/min": 0.0009191891294904053, "sampling/sampling_logp_difference/max": 6.992018699645996, "sampling/sampling_logp_difference/mean": 0.016055557876825333, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.16996632516384125, "epoch": 0.3247549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.183405230529886, "kl": 0.036977771669626236, "learning_rate": 9.9983558411534e-07, "loss": 0.0049, "num_tokens": 8113151.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6553928852081299, "sampling/importance_sampling_ratio/mean": 0.9998458027839661, "sampling/importance_sampling_ratio/min": 0.4956169128417969, "sampling/sampling_logp_difference/max": 0.7019519805908203, "sampling/sampling_logp_difference/mean": 0.011236455291509628, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 196.109375, "completions/mean_terminated_length": 196.109375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1882980465888977, "epoch": 0.32598039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 1.6896309395623348, "kl": 0.054088227450847626, "learning_rate": 9.99816809373879e-07, "loss": 0.0229, "num_tokens": 8145990.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001240968704224, "sampling/importance_sampling_ratio/min": 0.24144580960273743, "sampling/sampling_logp_difference/max": 1.4211102724075317, "sampling/sampling_logp_difference/mean": 0.01539707649499178, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 211.15625, "completions/mean_terminated_length": 211.15625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.24172553420066833, "epoch": 0.3272058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 2.4090101474163363, "kl": 0.0678095892071724, "learning_rate": 9.99797020035959e-07, "loss": -0.0382, "num_tokens": 8178928.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001152515411377, "sampling/importance_sampling_ratio/min": 0.4870632290840149, "sampling/sampling_logp_difference/max": 0.7327721118927002, "sampling/sampling_logp_difference/mean": 0.017825480550527573, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 264.78125, "completions/mean_terminated_length": 264.78125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.16782930493354797, "epoch": 0.3284313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.2461276023549521, "kl": 0.021782714873552322, "learning_rate": 9.997762161417517e-07, "loss": -0.0767, "num_tokens": 8213570.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000859260559082, "sampling/importance_sampling_ratio/min": 0.3568447530269623, "sampling/sampling_logp_difference/max": 1.0304545164108276, "sampling/sampling_logp_difference/mean": 0.012895837426185608, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 275.84375, "completions/mean_terminated_length": 275.84375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.187491774559021, "epoch": 0.32965686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.06064687623075509, "kl": 0.035978008061647415, "learning_rate": 9.997543977334873e-07, "loss": 0.0004, "num_tokens": 8256648.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998468160629272, "sampling/importance_sampling_ratio/min": 0.37678810954093933, "sampling/sampling_logp_difference/max": 0.9760723114013672, "sampling/sampling_logp_difference/mean": 0.01476958580315113, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 216.703125, "completions/mean_terminated_length": 216.703125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.23430466651916504, "epoch": 0.33088235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.06133336236799135, "kl": 0.02953743189573288, "learning_rate": 9.99731564855456e-07, "loss": 0.0003, "num_tokens": 8287189.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994462728500366, "sampling/importance_sampling_ratio/min": 0.3390989899635315, "sampling/sampling_logp_difference/max": 1.0814632177352905, "sampling/sampling_logp_difference/mean": 0.0169600211083889, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 307.265625, "completions/mean_terminated_length": 307.265625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.14084765315055847, "epoch": 0.3321078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.048327314027760665, "kl": 0.02697977051138878, "learning_rate": 9.997077175540066e-07, "loss": 0.0002, "num_tokens": 8326342.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.913650393486023, "sampling/importance_sampling_ratio/mean": 1.000585675239563, "sampling/importance_sampling_ratio/min": 0.47834527492523193, "sampling/sampling_logp_difference/max": 0.7374224662780762, "sampling/sampling_logp_difference/mean": 0.012069791555404663, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 183.015625, "completions/mean_terminated_length": 183.015625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.18567748367786407, "epoch": 0.3333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0970298499771554, "kl": 0.0578538179397583, "learning_rate": 9.996828558775485e-07, "loss": 0.0005, "num_tokens": 8358983.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7512283325195312, "sampling/importance_sampling_ratio/mean": 0.9999632835388184, "sampling/importance_sampling_ratio/min": 0.4345874786376953, "sampling/sampling_logp_difference/max": 0.8333580493927002, "sampling/sampling_logp_difference/mean": 0.014503656886518002, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 224.859375, "completions/mean_terminated_length": 224.859375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.20408973097801208, "epoch": 0.33455882352941174, "frac_reward_zero_std": 0.75, "grad_norm": 1.1503994156290938, "kl": 0.0713062733411789, "learning_rate": 9.996569798765487e-07, "loss": 0.0104, "num_tokens": 8388094.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8720554113388062, "sampling/importance_sampling_ratio/mean": 1.0004630088806152, "sampling/importance_sampling_ratio/min": 0.38671213388442993, "sampling/sampling_logp_difference/max": 0.9500746726989746, "sampling/sampling_logp_difference/mean": 0.014154992997646332, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 217.65625, "completions/mean_terminated_length": 217.65625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.19104424118995667, "epoch": 0.33578431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.059019722270121706, "kl": 0.027294889092445374, "learning_rate": 9.996300896035338e-07, "loss": 0.0003, "num_tokens": 8416872.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.745153784751892, "sampling/importance_sampling_ratio/mean": 0.9995296001434326, "sampling/importance_sampling_ratio/min": 0.3825327455997467, "sampling/sampling_logp_difference/max": 0.9609410762786865, "sampling/sampling_logp_difference/mean": 0.013582659885287285, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 261.53125, "completions/mean_terminated_length": 261.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15209710597991943, "epoch": 0.33700980392156865, "frac_reward_zero_std": 1.0, "grad_norm": 0.05488378670977295, "kl": 0.024721980094909668, "learning_rate": 9.996021851130896e-07, "loss": 0.0003, "num_tokens": 8451658.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9497171640396118, "sampling/importance_sampling_ratio/mean": 1.0000152587890625, "sampling/importance_sampling_ratio/min": 0.23663154244422913, "sampling/sampling_logp_difference/max": 1.4412510395050049, "sampling/sampling_logp_difference/mean": 0.012828035280108452, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 211.59375, "completions/mean_terminated_length": 211.59375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.1754235327243805, "epoch": 0.3382352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.06723153773365091, "kl": 0.03830999881029129, "learning_rate": 9.995732664618603e-07, "loss": 0.0004, "num_tokens": 8497664.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000011682510376, "sampling/importance_sampling_ratio/min": 0.3859061598777771, "sampling/sampling_logp_difference/max": 0.9521610736846924, "sampling/sampling_logp_difference/mean": 0.015094295144081116, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 288.453125, "completions/mean_terminated_length": 288.453125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.16961298882961273, "epoch": 0.3394607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 1.5767552367347628, "kl": 0.039779193699359894, "learning_rate": 9.99543333708549e-07, "loss": 0.0023, "num_tokens": 8532941.0, "reward": 0.03125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998610019683838, "sampling/importance_sampling_ratio/min": 0.17132364213466644, "sampling/sampling_logp_difference/max": 1.7642008066177368, "sampling/sampling_logp_difference/mean": 0.012929710559546947, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 206.265625, "completions/mean_terminated_length": 206.265625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.1733682006597519, "epoch": 0.34068627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.05794989946174121, "kl": 0.0274711512029171, "learning_rate": 9.995123869139176e-07, "loss": 0.0002, "num_tokens": 8559566.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992111921310425, "sampling/importance_sampling_ratio/min": 0.10852707177400589, "sampling/sampling_logp_difference/max": 2.2207555770874023, "sampling/sampling_logp_difference/mean": 0.014123711735010147, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 234.46875, "completions/mean_terminated_length": 234.46875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.23664844036102295, "epoch": 0.34191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0550953417439475, "kl": 0.03200116753578186, "learning_rate": 9.994804261407854e-07, "loss": 0.0003, "num_tokens": 8600412.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7216484546661377, "sampling/importance_sampling_ratio/mean": 1.0006502866744995, "sampling/importance_sampling_ratio/min": 0.3945484459400177, "sampling/sampling_logp_difference/max": 0.9300134181976318, "sampling/sampling_logp_difference/mean": 0.017763927578926086, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 138.734375, "completions/mean_terminated_length": 138.734375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.18176689743995667, "epoch": 0.3431372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.5866845245694385, "kl": 0.0721161812543869, "learning_rate": 9.994474514540312e-07, "loss": 0.0195, "num_tokens": 8632811.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999579191207886, "sampling/importance_sampling_ratio/min": 0.5019660592079163, "sampling/sampling_logp_difference/max": 0.7264273166656494, "sampling/sampling_logp_difference/mean": 0.01411209162324667, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 331.859375, "completions/mean_terminated_length": 331.859375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.18779036402702332, "epoch": 0.3443627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.8192113355382399, "kl": 0.04017651081085205, "learning_rate": 9.994134629205917e-07, "loss": -0.0012, "num_tokens": 8671138.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7152165174484253, "sampling/importance_sampling_ratio/mean": 1.0000743865966797, "sampling/importance_sampling_ratio/min": 0.3180188238620758, "sampling/sampling_logp_difference/max": 1.1456446647644043, "sampling/sampling_logp_difference/mean": 0.012649483978748322, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1427.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 379.546875, "completions/mean_terminated_length": 379.546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.14155760407447815, "epoch": 0.34558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.07669496214440431, "kl": 0.03512256219983101, "learning_rate": 9.99378460609461e-07, "loss": 0.0002, "num_tokens": 8709973.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.752143383026123, "sampling/importance_sampling_ratio/mean": 0.9998910427093506, "sampling/importance_sampling_ratio/min": 0.48715895414352417, "sampling/sampling_logp_difference/max": 0.7191648483276367, "sampling/sampling_logp_difference/mean": 0.011532300151884556, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 332.921875, "completions/mean_terminated_length": 332.921875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.26236775517463684, "epoch": 0.34681372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 1.4283450163176932, "kl": 0.034624628722667694, "learning_rate": 9.993424445916922e-07, "loss": 0.008, "num_tokens": 8748688.0, "reward": -0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9392987489700317, "sampling/importance_sampling_ratio/mean": 0.9999065399169922, "sampling/importance_sampling_ratio/min": 0.43108800053596497, "sampling/sampling_logp_difference/max": 0.8414430618286133, "sampling/sampling_logp_difference/mean": 0.01542898640036583, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 183.96875, "completions/mean_terminated_length": 183.96875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2028695046901703, "epoch": 0.3480392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.07872260060535358, "kl": 0.029936885461211205, "learning_rate": 9.993054149403949e-07, "loss": 0.0003, "num_tokens": 8777182.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003210306167603, "sampling/importance_sampling_ratio/min": 0.6080567240715027, "sampling/sampling_logp_difference/max": 0.7567486763000488, "sampling/sampling_logp_difference/mean": 0.014912188053131104, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 179.4375, "completions/mean_terminated_length": 179.4375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2501886785030365, "epoch": 0.3492647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 1.352352338282128, "kl": 0.04442233592271805, "learning_rate": 9.992673717307372e-07, "loss": -0.0322, "num_tokens": 8805418.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996594786643982, "sampling/importance_sampling_ratio/min": 0.4986007809638977, "sampling/sampling_logp_difference/max": 1.569200038909912, "sampling/sampling_logp_difference/mean": 0.016398219391703606, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 317.25, "completions/mean_terminated_length": 317.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.25562727451324463, "epoch": 0.35049019607843135, "frac_reward_zero_std": 1.0, "grad_norm": 0.03983127866431625, "kl": 0.01921760104596615, "learning_rate": 9.992283150399446e-07, "loss": 0.0002, "num_tokens": 8845498.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999561071395874, "sampling/importance_sampling_ratio/min": 0.41906750202178955, "sampling/sampling_logp_difference/max": 0.8697233200073242, "sampling/sampling_logp_difference/mean": 0.014112534001469612, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 181.78125, "completions/mean_terminated_length": 181.78125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1340177357196808, "epoch": 0.35171568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.09420534167407238, "kl": 0.04165516048669815, "learning_rate": 9.991882449472994e-07, "loss": 0.0004, "num_tokens": 8871020.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9570924043655396, "sampling/importance_sampling_ratio/mean": 0.9995983839035034, "sampling/importance_sampling_ratio/min": 0.3091437816619873, "sampling/sampling_logp_difference/max": 1.1739487648010254, "sampling/sampling_logp_difference/mean": 0.011645602062344551, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 302.84375, "completions/mean_terminated_length": 302.84375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.16266974806785583, "epoch": 0.35294117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.07678208797274977, "kl": 0.03029218316078186, "learning_rate": 9.991471615341415e-07, "loss": 0.0003, "num_tokens": 8908882.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000216960906982, "sampling/importance_sampling_ratio/min": 0.5609308481216431, "sampling/sampling_logp_difference/max": 0.7711536884307861, "sampling/sampling_logp_difference/mean": 0.010535717010498047, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.16066837310791016, "epoch": 0.3541666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 1.0617927358175623, "kl": 0.02349277213215828, "learning_rate": 9.991050648838675e-07, "loss": 0.0104, "num_tokens": 8953658.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8624604940414429, "sampling/importance_sampling_ratio/mean": 1.0008823871612549, "sampling/importance_sampling_ratio/min": 0.2730712294578552, "sampling/sampling_logp_difference/max": 1.2980226278305054, "sampling/sampling_logp_difference/mean": 0.012821042910218239, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22517183423042297, "epoch": 0.3553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.8833675798052343, "kl": 0.03744828328490257, "learning_rate": 9.990619550819312e-07, "loss": 0.0569, "num_tokens": 8989802.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999470114707947, "sampling/importance_sampling_ratio/min": 0.439327210187912, "sampling/sampling_logp_difference/max": 0.8553783893585205, "sampling/sampling_logp_difference/mean": 0.014835933223366737, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 264.34375, "completions/mean_terminated_length": 264.34375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.17973962426185608, "epoch": 0.35661764705882354, "frac_reward_zero_std": 0.75, "grad_norm": 1.1682813575991569, "kl": 0.04859580099582672, "learning_rate": 9.990178322158424e-07, "loss": -0.005, "num_tokens": 9022816.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8273805379867554, "sampling/importance_sampling_ratio/mean": 0.9987258315086365, "sampling/importance_sampling_ratio/min": 0.41369491815567017, "sampling/sampling_logp_difference/max": 0.8826265335083008, "sampling/sampling_logp_difference/mean": 0.014202505350112915, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 268.03125, "completions/mean_terminated_length": 268.03125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.21925997734069824, "epoch": 0.35784313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.07576893404582719, "kl": 0.03912314772605896, "learning_rate": 9.989726963751682e-07, "loss": 0.0003, "num_tokens": 9061986.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995810985565186, "sampling/importance_sampling_ratio/min": 0.45204266905784607, "sampling/sampling_logp_difference/max": 0.9608368873596191, "sampling/sampling_logp_difference/mean": 0.01378319039940834, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 278.234375, "completions/mean_terminated_length": 278.234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.17324486374855042, "epoch": 0.3590686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.8174386073848596, "kl": 0.031908657401800156, "learning_rate": 9.989265476515309e-07, "loss": -0.0612, "num_tokens": 9099761.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9097315073013306, "sampling/importance_sampling_ratio/mean": 1.000276803970337, "sampling/importance_sampling_ratio/min": 0.2784660756587982, "sampling/sampling_logp_difference/max": 1.278459072113037, "sampling/sampling_logp_difference/mean": 0.01174275204539299, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2010985016822815, "epoch": 0.3602941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1588447696198672, "kl": 0.02160612866282463, "learning_rate": 9.9887938613861e-07, "loss": 0.0023, "num_tokens": 9143521.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.87212336063385, "sampling/importance_sampling_ratio/mean": 1.0001239776611328, "sampling/importance_sampling_ratio/min": 0.44800442457199097, "sampling/sampling_logp_difference/max": 0.802952229976654, "sampling/sampling_logp_difference/mean": 0.012264113873243332, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 212.765625, "completions/mean_terminated_length": 212.765625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2683764100074768, "epoch": 0.36151960784313725, "frac_reward_zero_std": 1.0, "grad_norm": 0.07303974922055913, "kl": 0.04338815063238144, "learning_rate": 9.988312119321402e-07, "loss": 0.0004, "num_tokens": 9172114.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6276811361312866, "sampling/importance_sampling_ratio/mean": 1.0005288124084473, "sampling/importance_sampling_ratio/min": 0.5363470911979675, "sampling/sampling_logp_difference/max": 0.6229737997055054, "sampling/sampling_logp_difference/mean": 0.015231205150485039, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 258.796875, "completions/mean_terminated_length": 258.796875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26148664951324463, "epoch": 0.3627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.1008567391160247, "kl": 0.03650952875614166, "learning_rate": 9.98782025129912e-07, "loss": 0.019, "num_tokens": 9205589.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9441547393798828, "sampling/importance_sampling_ratio/mean": 0.9999445676803589, "sampling/importance_sampling_ratio/min": 0.4872017800807953, "sampling/sampling_logp_difference/max": 0.7190768718719482, "sampling/sampling_logp_difference/mean": 0.014916934072971344, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 223.484375, "completions/mean_terminated_length": 223.484375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2084149718284607, "epoch": 0.3639705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.926986647314951, "kl": 0.03650977462530136, "learning_rate": 9.987318258317715e-07, "loss": 0.0132, "num_tokens": 9235124.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8333377838134766, "sampling/importance_sampling_ratio/mean": 1.0002769231796265, "sampling/importance_sampling_ratio/min": 0.5615825057029724, "sampling/sampling_logp_difference/max": 0.6061382293701172, "sampling/sampling_logp_difference/mean": 0.01239529624581337, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 185.703125, "completions/mean_terminated_length": 185.703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.21163314580917358, "epoch": 0.36519607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.07274281159547615, "kl": 0.03799588233232498, "learning_rate": 9.986806141396205e-07, "loss": 0.0004, "num_tokens": 9264353.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.944161295890808, "sampling/importance_sampling_ratio/mean": 1.000054121017456, "sampling/importance_sampling_ratio/min": 0.4457568824291229, "sampling/sampling_logp_difference/max": 0.8079816102981567, "sampling/sampling_logp_difference/mean": 0.014065688475966454, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 218.78125, "completions/mean_terminated_length": 218.78125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2359258532524109, "epoch": 0.36642156862745096, "frac_reward_zero_std": 0.75, "grad_norm": 1.229085798192281, "kl": 0.05050275847315788, "learning_rate": 9.986283901574149e-07, "loss": -0.025, "num_tokens": 9294499.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994552135467529, "sampling/importance_sampling_ratio/min": 0.48804524540901184, "sampling/sampling_logp_difference/max": 0.7173471450805664, "sampling/sampling_logp_difference/mean": 0.015633976086974144, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 213.6875, "completions/mean_terminated_length": 213.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2758669853210449, "epoch": 0.36764705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 1.2045180532016053, "kl": 0.04507840797305107, "learning_rate": 9.985751539911664e-07, "loss": 0.0005, "num_tokens": 9326383.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6551718711853027, "sampling/importance_sampling_ratio/mean": 1.0001018047332764, "sampling/importance_sampling_ratio/min": 0.5446324348449707, "sampling/sampling_logp_difference/max": 0.6076440811157227, "sampling/sampling_logp_difference/mean": 0.014851532876491547, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 217.65625, "completions/mean_terminated_length": 217.65625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2571743130683899, "epoch": 0.36887254901960786, "frac_reward_zero_std": 1.0, "grad_norm": 0.05078752455404774, "kl": 0.04069396108388901, "learning_rate": 9.985209057489408e-07, "loss": 0.0004, "num_tokens": 9359049.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8130477666854858, "sampling/importance_sampling_ratio/mean": 1.0003347396850586, "sampling/importance_sampling_ratio/min": 0.4874882102012634, "sampling/sampling_logp_difference/max": 0.7184891700744629, "sampling/sampling_logp_difference/mean": 0.015300736762583256, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 194.171875, "completions/mean_terminated_length": 194.171875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.21515023708343506, "epoch": 0.3700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 2.757122950918929, "kl": 0.054561879485845566, "learning_rate": 9.98465645540859e-07, "loss": 0.0068, "num_tokens": 9387956.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000688910484314, "sampling/importance_sampling_ratio/min": 0.38900381326675415, "sampling/sampling_logp_difference/max": 0.9441661834716797, "sampling/sampling_logp_difference/mean": 0.01539285946637392, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 244.0625, "completions/mean_terminated_length": 244.0625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2658084034919739, "epoch": 0.3713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.046382865558398226, "kl": 0.043975912034511566, "learning_rate": 9.984093734790954e-07, "loss": 0.0004, "num_tokens": 9423800.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8894855976104736, "sampling/importance_sampling_ratio/mean": 0.999910831451416, "sampling/importance_sampling_ratio/min": 0.6174265742301941, "sampling/sampling_logp_difference/max": 0.6363046169281006, "sampling/sampling_logp_difference/mean": 0.015923671424388885, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 206.28125, "completions/mean_terminated_length": 206.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.24521231651306152, "epoch": 0.37254901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.06401306952952608, "kl": 0.05235999822616577, "learning_rate": 9.983520896778788e-07, "loss": 0.0005, "num_tokens": 9461882.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997357726097107, "sampling/importance_sampling_ratio/min": 0.380566269159317, "sampling/sampling_logp_difference/max": 0.966094970703125, "sampling/sampling_logp_difference/mean": 0.016459612175822258, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 162.828125, "completions/mean_terminated_length": 162.828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2646932899951935, "epoch": 0.3737745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.08126708221862732, "kl": 0.09018871188163757, "learning_rate": 9.982937942534917e-07, "loss": 0.0009, "num_tokens": 9488943.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002355575561523, "sampling/importance_sampling_ratio/min": 0.2654292583465576, "sampling/sampling_logp_difference/max": 1.3264069557189941, "sampling/sampling_logp_difference/mean": 0.016122879460453987, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 179.53125, "completions/mean_terminated_length": 179.53125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24329300224781036, "epoch": 0.375, "frac_reward_zero_std": 0.75, "grad_norm": 1.4688878428154875, "kl": 0.07533843070268631, "learning_rate": 9.982344873242701e-07, "loss": 0.0022, "num_tokens": 9516065.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5966145992279053, "sampling/importance_sampling_ratio/mean": 1.000145673751831, "sampling/importance_sampling_ratio/min": 0.6077024936676025, "sampling/sampling_logp_difference/max": 0.49806976318359375, "sampling/sampling_logp_difference/mean": 0.014692382887005806, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 172.265625, "completions/mean_terminated_length": 172.265625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1821404993534088, "epoch": 0.3762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.054276720759989065, "kl": 0.047477252781391144, "learning_rate": 9.981741690106034e-07, "loss": 0.0005, "num_tokens": 9546386.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002491474151611, "sampling/importance_sampling_ratio/min": 0.6056228876113892, "sampling/sampling_logp_difference/max": 0.7309679985046387, "sampling/sampling_logp_difference/mean": 0.012202339246869087, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 209.234375, "completions/mean_terminated_length": 209.234375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.18036088347434998, "epoch": 0.37745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.037200787937837765, "kl": 0.0318170040845871, "learning_rate": 9.981128394349337e-07, "loss": 0.0003, "num_tokens": 9576913.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8171582221984863, "sampling/importance_sampling_ratio/mean": 1.000547170639038, "sampling/importance_sampling_ratio/min": 0.5328795313835144, "sampling/sampling_logp_difference/max": 0.6294598579406738, "sampling/sampling_logp_difference/mean": 0.012505259364843369, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 199.28125, "completions/mean_terminated_length": 199.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2590799629688263, "epoch": 0.3786764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.412172841260048, "kl": 0.05636392906308174, "learning_rate": 9.980504987217566e-07, "loss": -0.0429, "num_tokens": 9604531.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002868175506592, "sampling/importance_sampling_ratio/min": 0.6122801899909973, "sampling/sampling_logp_difference/max": 1.017359733581543, "sampling/sampling_logp_difference/mean": 0.01534392312169075, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 252.109375, "completions/mean_terminated_length": 252.109375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2604590058326721, "epoch": 0.3799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.14569476673964082, "kl": 0.06339927017688751, "learning_rate": 9.979871469976195e-07, "loss": 0.0006, "num_tokens": 9642266.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999227523803711, "sampling/importance_sampling_ratio/min": 0.5187325477600098, "sampling/sampling_logp_difference/max": 0.7288380861282349, "sampling/sampling_logp_difference/mean": 0.015960384160280228, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 159.921875, "completions/mean_terminated_length": 159.921875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.23250165581703186, "epoch": 0.38112745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0633905125342962, "kl": 0.061814285814762115, "learning_rate": 9.979227843911224e-07, "loss": 0.0006, "num_tokens": 9674741.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4908007383346558, "sampling/importance_sampling_ratio/mean": 0.9994626045227051, "sampling/importance_sampling_ratio/min": 0.5913910269737244, "sampling/sampling_logp_difference/max": 0.5252777934074402, "sampling/sampling_logp_difference/mean": 0.014160791411995888, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 224.90625, "completions/mean_terminated_length": 224.90625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2519511580467224, "epoch": 0.38235294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 1.4181004390709577, "kl": 0.04638572037220001, "learning_rate": 9.978574110329172e-07, "loss": -0.0059, "num_tokens": 9712223.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9753763675689697, "sampling/importance_sampling_ratio/mean": 1.0001909732818604, "sampling/importance_sampling_ratio/min": 0.4974619448184967, "sampling/sampling_logp_difference/max": 0.6982362270355225, "sampling/sampling_logp_difference/mean": 0.014956100843846798, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 181.15625, "completions/mean_terminated_length": 181.15625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.24080537259578705, "epoch": 0.38357843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.06269812108988729, "kl": 0.05156532675027847, "learning_rate": 9.977910270557078e-07, "loss": 0.0005, "num_tokens": 9745305.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7569202184677124, "sampling/importance_sampling_ratio/mean": 0.9995178580284119, "sampling/importance_sampling_ratio/min": 0.42265576124191284, "sampling/sampling_logp_difference/max": 0.8611972332000732, "sampling/sampling_logp_difference/mean": 0.016552705317735672, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 213.671875, "completions/mean_terminated_length": 213.671875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24672749638557434, "epoch": 0.38480392156862747, "frac_reward_zero_std": 1.0, "grad_norm": 0.038710724119157124, "kl": 0.03980868682265282, "learning_rate": 9.977236325942497e-07, "loss": 0.0004, "num_tokens": 9779444.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7142006158828735, "sampling/importance_sampling_ratio/mean": 0.9995484948158264, "sampling/importance_sampling_ratio/min": 0.5472590327262878, "sampling/sampling_logp_difference/max": 0.6028330326080322, "sampling/sampling_logp_difference/mean": 0.014922859147191048, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 203.515625, "completions/mean_terminated_length": 203.515625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3034396767616272, "epoch": 0.3860294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.4020050860954343, "kl": 0.0434013307094574, "learning_rate": 9.97655227785349e-07, "loss": -0.003, "num_tokens": 9809765.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5903964042663574, "sampling/importance_sampling_ratio/mean": 1.0003769397735596, "sampling/importance_sampling_ratio/min": 0.4204670190811157, "sampling/sampling_logp_difference/max": 0.866389274597168, "sampling/sampling_logp_difference/mean": 0.016428284347057343, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 223.921875, "completions/mean_terminated_length": 223.921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22055089473724365, "epoch": 0.3872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.06897500649119237, "kl": 0.04714478179812431, "learning_rate": 9.975858127678633e-07, "loss": 0.0004, "num_tokens": 9843376.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.663209319114685, "sampling/importance_sampling_ratio/mean": 1.000046730041504, "sampling/importance_sampling_ratio/min": 0.6187101006507874, "sampling/sampling_logp_difference/max": 0.5087490081787109, "sampling/sampling_logp_difference/mean": 0.01338781975209713, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 186.484375, "completions/mean_terminated_length": 186.484375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.28117749094963074, "epoch": 0.38848039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 1.1723249204037125, "kl": 0.06439540535211563, "learning_rate": 9.975153876827007e-07, "loss": 0.0153, "num_tokens": 9873151.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6037282943725586, "sampling/importance_sampling_ratio/mean": 0.9997469186782837, "sampling/importance_sampling_ratio/min": 0.262337863445282, "sampling/sampling_logp_difference/max": 1.338122010231018, "sampling/sampling_logp_difference/mean": 0.01560327596962452, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 174.953125, "completions/mean_terminated_length": 174.953125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.1813991814851761, "epoch": 0.3897058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.09730586734867665, "kl": 0.049150414764881134, "learning_rate": 9.974439526728196e-07, "loss": 0.0005, "num_tokens": 9902892.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8558018207550049, "sampling/importance_sampling_ratio/mean": 0.999718427658081, "sampling/importance_sampling_ratio/min": 0.5134551525115967, "sampling/sampling_logp_difference/max": 0.6665925979614258, "sampling/sampling_logp_difference/mean": 0.012492336332798004, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 184.515625, "completions/mean_terminated_length": 184.515625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.25537946820259094, "epoch": 0.3909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.048250038071933725, "kl": 0.04358714073896408, "learning_rate": 9.973715078832286e-07, "loss": 0.0004, "num_tokens": 9930685.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6917471885681152, "sampling/importance_sampling_ratio/mean": 1.0004292726516724, "sampling/importance_sampling_ratio/min": 0.4916207194328308, "sampling/sampling_logp_difference/max": 0.710047721862793, "sampling/sampling_logp_difference/mean": 0.015268863178789616, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 143.734375, "completions/mean_terminated_length": 143.734375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.23418602347373962, "epoch": 0.39215686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.08301223920073554, "kl": 0.06582756340503693, "learning_rate": 9.97298053460986e-07, "loss": 0.0006, "num_tokens": 9958860.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7450684309005737, "sampling/importance_sampling_ratio/mean": 0.999579668045044, "sampling/importance_sampling_ratio/min": 0.17664504051208496, "sampling/sampling_logp_difference/max": 1.7336130142211914, "sampling/sampling_logp_difference/mean": 0.014755094423890114, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.213094100356102, "epoch": 0.39338235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.07286969598929685, "kl": 0.06171388551592827, "learning_rate": 9.972235895552e-07, "loss": 0.0006, "num_tokens": 9983716.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.597754716873169, "sampling/importance_sampling_ratio/mean": 1.000230312347412, "sampling/importance_sampling_ratio/min": 0.4993267059326172, "sampling/sampling_logp_difference/max": 0.6944947242736816, "sampling/sampling_logp_difference/mean": 0.013996414840221405, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 182.765625, "completions/mean_terminated_length": 182.765625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.28134018182754517, "epoch": 0.3946078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.3561961079152003, "kl": 0.07659714668989182, "learning_rate": 9.971481163170269e-07, "loss": 0.0083, "num_tokens": 10016693.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996200203895569, "sampling/importance_sampling_ratio/min": 0.15386080741882324, "sampling/sampling_logp_difference/max": 1.8717069625854492, "sampling/sampling_logp_difference/mean": 0.017205171287059784, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 240.359375, "completions/mean_terminated_length": 240.359375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.19475317001342773, "epoch": 0.3958333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03885232754500029, "kl": 0.039011888206005096, "learning_rate": 9.97071633899673e-07, "loss": 0.0004, "num_tokens": 10047308.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9393295049667358, "sampling/importance_sampling_ratio/mean": 0.9997988343238831, "sampling/importance_sampling_ratio/min": 0.4440854787826538, "sampling/sampling_logp_difference/max": 0.8117382526397705, "sampling/sampling_logp_difference/mean": 0.011159185320138931, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 177.46875, "completions/mean_terminated_length": 177.46875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26126256585121155, "epoch": 0.39705882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.056674788480339515, "kl": 0.05115184187889099, "learning_rate": 9.969941424583925e-07, "loss": 0.0005, "num_tokens": 10079914.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8065956830978394, "sampling/importance_sampling_ratio/mean": 0.9991174936294556, "sampling/importance_sampling_ratio/min": 0.3691619634628296, "sampling/sampling_logp_difference/max": 0.9965198040008545, "sampling/sampling_logp_difference/mean": 0.017451079562306404, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 162.140625, "completions/mean_terminated_length": 162.140625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.28593000769615173, "epoch": 0.39828431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.16347853531304926, "kl": 0.06763357669115067, "learning_rate": 9.969156421504887e-07, "loss": 0.0006, "num_tokens": 10109251.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9765565395355225, "sampling/importance_sampling_ratio/mean": 1.0008492469787598, "sampling/importance_sampling_ratio/min": 0.22377948462963104, "sampling/sampling_logp_difference/max": 1.4970941543579102, "sampling/sampling_logp_difference/mean": 0.01835448667407036, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 142.890625, "completions/mean_terminated_length": 142.890625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.237276092171669, "epoch": 0.39950980392156865, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060328620195097, "kl": 0.06441694498062134, "learning_rate": 9.968361331353116e-07, "loss": 0.0006, "num_tokens": 10132060.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6106983423233032, "sampling/importance_sampling_ratio/mean": 1.000266671180725, "sampling/importance_sampling_ratio/min": 0.22330020368099213, "sampling/sampling_logp_difference/max": 1.4992382526397705, "sampling/sampling_logp_difference/mean": 0.014961745589971542, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 179.078125, "completions/mean_terminated_length": 179.078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.29044216871261597, "epoch": 0.4007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.061437390410855684, "kl": 0.06264321506023407, "learning_rate": 9.9675561557426e-07, "loss": 0.0006, "num_tokens": 10164001.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.732211947441101, "sampling/importance_sampling_ratio/mean": 0.9993464946746826, "sampling/importance_sampling_ratio/min": 0.47762712836265564, "sampling/sampling_logp_difference/max": 0.7389249801635742, "sampling/sampling_logp_difference/mean": 0.017121639102697372, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 197.640625, "completions/mean_terminated_length": 197.640625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.28534969687461853, "epoch": 0.4019607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.551587661231534, "kl": 0.05487293004989624, "learning_rate": 9.966740896307791e-07, "loss": 0.0133, "num_tokens": 10196522.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6065754890441895, "sampling/importance_sampling_ratio/mean": 1.0002343654632568, "sampling/importance_sampling_ratio/min": 0.5187581181526184, "sampling/sampling_logp_difference/max": 0.6563175320625305, "sampling/sampling_logp_difference/mean": 0.01480232086032629, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23456662893295288, "epoch": 0.40318627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.0676532720442417, "kl": 0.05448365956544876, "learning_rate": 9.965915554703613e-07, "loss": 0.0005, "num_tokens": 10222662.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6160944700241089, "sampling/importance_sampling_ratio/mean": 0.9998668432235718, "sampling/importance_sampling_ratio/min": 0.6147313714027405, "sampling/sampling_logp_difference/max": 0.486569881439209, "sampling/sampling_logp_difference/mean": 0.012745749205350876, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 185.9375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.28205636143684387, "epoch": 0.40441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.04557906960290233, "kl": 0.04451793432235718, "learning_rate": 9.965080132605461e-07, "loss": 0.0004, "num_tokens": 10252466.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7039932012557983, "sampling/importance_sampling_ratio/mean": 0.9997226595878601, "sampling/importance_sampling_ratio/min": 0.6168078780174255, "sampling/sampling_logp_difference/max": 0.5329744815826416, "sampling/sampling_logp_difference/mean": 0.014937239699065685, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24828140437602997, "epoch": 0.4056372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.06048318618681636, "kl": 0.043864063918590546, "learning_rate": 9.964234631709185e-07, "loss": 0.0004, "num_tokens": 10284970.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997553825378418, "sampling/importance_sampling_ratio/min": 0.5263671278953552, "sampling/sampling_logp_difference/max": 0.7147336006164551, "sampling/sampling_logp_difference/mean": 0.015005389228463173, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 211.484375, "completions/mean_terminated_length": 211.484375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.21704155206680298, "epoch": 0.4068627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.04417146396250178, "kl": 0.046004392206668854, "learning_rate": 9.963379053731102e-07, "loss": 0.0004, "num_tokens": 10314825.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999356269836426, "sampling/importance_sampling_ratio/min": 0.5363428592681885, "sampling/sampling_logp_difference/max": 0.7130594253540039, "sampling/sampling_logp_difference/mean": 0.01378547865897417, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 167.921875, "completions/mean_terminated_length": 167.921875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2341511845588684, "epoch": 0.40808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.05261681014748957, "kl": 0.04148735851049423, "learning_rate": 9.96251340040798e-07, "loss": 0.0004, "num_tokens": 10342644.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.794343113899231, "sampling/importance_sampling_ratio/mean": 1.0001440048217773, "sampling/importance_sampling_ratio/min": 0.6222392916679382, "sampling/sampling_logp_difference/max": 0.5846390724182129, "sampling/sampling_logp_difference/mean": 0.014835352078080177, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 163.359375, "completions/mean_terminated_length": 163.359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.24428749084472656, "epoch": 0.40931372549019607, "frac_reward_zero_std": 1.0, "grad_norm": 0.05334254161930812, "kl": 0.048776090145111084, "learning_rate": 9.96163767349704e-07, "loss": 0.0005, "num_tokens": 10373755.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9219019412994385, "sampling/importance_sampling_ratio/mean": 1.0004420280456543, "sampling/importance_sampling_ratio/min": 0.4739871621131897, "sampling/sampling_logp_difference/max": 0.7465750575065613, "sampling/sampling_logp_difference/mean": 0.015613086521625519, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 235.828125, "completions/mean_terminated_length": 235.828125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.31158655881881714, "epoch": 0.4105392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.08802832666554632, "kl": 0.05691176652908325, "learning_rate": 9.96075187477595e-07, "loss": 0.0005, "num_tokens": 10406544.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6408649682998657, "sampling/importance_sampling_ratio/mean": 0.9993789792060852, "sampling/importance_sampling_ratio/min": 0.4982425272464752, "sampling/sampling_logp_difference/max": 0.6966683864593506, "sampling/sampling_logp_difference/mean": 0.017097072675824165, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 137.703125, "completions/mean_terminated_length": 137.703125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.19279997050762177, "epoch": 0.4117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494635031712655, "kl": 0.05286743491888046, "learning_rate": 9.959856006042828e-07, "loss": 0.0005, "num_tokens": 10433357.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7472764253616333, "sampling/importance_sampling_ratio/mean": 0.9991959929466248, "sampling/importance_sampling_ratio/min": 0.5327165722846985, "sampling/sampling_logp_difference/max": 0.6297657489776611, "sampling/sampling_logp_difference/mean": 0.011947166174650192, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 136.484375, "completions/mean_terminated_length": 136.484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2865660488605499, "epoch": 0.41299019607843135, "frac_reward_zero_std": 1.0, "grad_norm": 0.15011147787428938, "kl": 0.068015456199646, "learning_rate": 9.95895006911623e-07, "loss": 0.0007, "num_tokens": 10463676.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996687769889832, "sampling/importance_sampling_ratio/min": 0.13205653429031372, "sampling/sampling_logp_difference/max": 2.0245251655578613, "sampling/sampling_logp_difference/mean": 0.02037999778985977, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 220.828125, "completions/mean_terminated_length": 220.828125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.22336722910404205, "epoch": 0.41421568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.041152110769261244, "kl": 0.029732730239629745, "learning_rate": 9.95803406583515e-07, "loss": 0.0003, "num_tokens": 10491585.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6976975202560425, "sampling/importance_sampling_ratio/mean": 0.9999824166297913, "sampling/importance_sampling_ratio/min": 0.5235568284988403, "sampling/sampling_logp_difference/max": 0.6471096277236938, "sampling/sampling_logp_difference/mean": 0.013062494806945324, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 168.59375, "completions/mean_terminated_length": 168.59375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.29412657022476196, "epoch": 0.41544117647058826, "frac_reward_zero_std": 0.5, "grad_norm": 2.3569155139365905, "kl": 0.08087459206581116, "learning_rate": 9.957107998059018e-07, "loss": -0.0774, "num_tokens": 10518007.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.637667179107666, "sampling/importance_sampling_ratio/mean": 0.9994561672210693, "sampling/importance_sampling_ratio/min": 0.6117925047874451, "sampling/sampling_logp_difference/max": 0.4932727813720703, "sampling/sampling_logp_difference/mean": 0.017368581146001816, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 178.140625, "completions/mean_terminated_length": 178.140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.28629130125045776, "epoch": 0.4166666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 1.6632228991561313, "kl": 0.05552738159894943, "learning_rate": 9.956171867667693e-07, "loss": -0.0419, "num_tokens": 10552560.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6182644367218018, "sampling/importance_sampling_ratio/mean": 0.9997725486755371, "sampling/importance_sampling_ratio/min": 0.5164028406143188, "sampling/sampling_logp_difference/max": 0.6608681678771973, "sampling/sampling_logp_difference/mean": 0.01721036247909069, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 142.046875, "completions/mean_terminated_length": 142.046875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.253487765789032, "epoch": 0.4178921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.07404755502753235, "kl": 0.04503706842660904, "learning_rate": 9.955225676561459e-07, "loss": 0.0005, "num_tokens": 10574387.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6683697700500488, "sampling/importance_sampling_ratio/mean": 0.9993175268173218, "sampling/importance_sampling_ratio/min": 0.39866963028907776, "sampling/sampling_logp_difference/max": 0.9196221828460693, "sampling/sampling_logp_difference/mean": 0.017505541443824768, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 203.265625, "completions/mean_terminated_length": 203.265625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.1682276874780655, "epoch": 0.41911764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.040842604358258025, "kl": 0.028833290562033653, "learning_rate": 9.954269426661022e-07, "loss": 0.0003, "num_tokens": 10611556.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.0004328489303589, "sampling/importance_sampling_ratio/min": 0.512783944606781, "sampling/sampling_logp_difference/max": 0.6679006814956665, "sampling/sampling_logp_difference/mean": 0.01228379923850298, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 156.59375, "completions/mean_terminated_length": 156.59375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2435804307460785, "epoch": 0.42034313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.9371516107086189, "kl": 0.04949047416448593, "learning_rate": 9.953303119907513e-07, "loss": -0.0202, "num_tokens": 10639210.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994317293167114, "sampling/importance_sampling_ratio/min": 0.264559268951416, "sampling/sampling_logp_difference/max": 1.3296899795532227, "sampling/sampling_logp_difference/mean": 0.01969146728515625, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 193.375, "completions/mean_terminated_length": 193.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2623113989830017, "epoch": 0.4215686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.348681539886754, "kl": 0.049027375876903534, "learning_rate": 9.952326758262472e-07, "loss": 0.0203, "num_tokens": 10671058.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6695427894592285, "sampling/importance_sampling_ratio/mean": 1.0002408027648926, "sampling/importance_sampling_ratio/min": 0.5719989538192749, "sampling/sampling_logp_difference/max": 0.5586180686950684, "sampling/sampling_logp_difference/mean": 0.01638277992606163, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.13473373651504517, "epoch": 0.4227941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.031428930438624054, "kl": 0.017639823257923126, "learning_rate": 9.95134034370785e-07, "loss": 0.0002, "num_tokens": 10696410.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 1.0000739097595215, "sampling/importance_sampling_ratio/min": 0.3698100745677948, "sampling/sampling_logp_difference/max": 0.9947657585144043, "sampling/sampling_logp_difference/mean": 0.01015438698232174, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 172.171875, "completions/mean_terminated_length": 172.171875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.22618728876113892, "epoch": 0.42401960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 2.1376996765904877, "kl": 0.041539475321769714, "learning_rate": 9.950343878246009e-07, "loss": -0.0062, "num_tokens": 10733653.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9957953691482544, "sampling/importance_sampling_ratio/mean": 1.0007014274597168, "sampling/importance_sampling_ratio/min": 0.4055122137069702, "sampling/sampling_logp_difference/max": 0.902604341506958, "sampling/sampling_logp_difference/mean": 0.01611269637942314, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 247.140625, "completions/mean_terminated_length": 247.140625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2703923285007477, "epoch": 0.4252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.05173105447890271, "kl": 0.046294279396533966, "learning_rate": 9.949337363899708e-07, "loss": 0.0003, "num_tokens": 10765342.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7525776624679565, "sampling/importance_sampling_ratio/mean": 1.0000587701797485, "sampling/importance_sampling_ratio/min": 0.4304106533527374, "sampling/sampling_logp_difference/max": 0.8430154919624329, "sampling/sampling_logp_difference/mean": 0.016863878816366196, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 178.78125, "completions/mean_terminated_length": 178.78125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.26333087682724, "epoch": 0.4264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.09253528431740904, "kl": 0.05567295104265213, "learning_rate": 9.948320802712107e-07, "loss": 0.0005, "num_tokens": 10792064.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5458966493606567, "sampling/importance_sampling_ratio/mean": 1.000347375869751, "sampling/importance_sampling_ratio/min": 0.5169248580932617, "sampling/sampling_logp_difference/max": 0.6598577499389648, "sampling/sampling_logp_difference/mean": 0.016856186091899872, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 265.40625, "completions/mean_terminated_length": 265.40625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.19036869704723358, "epoch": 0.42769607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.029572162983203348, "kl": 0.01962047815322876, "learning_rate": 9.947294196746762e-07, "loss": 0.0002, "num_tokens": 10827850.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005722045898438, "sampling/importance_sampling_ratio/min": 0.474294513463974, "sampling/sampling_logp_difference/max": 1.0838651657104492, "sampling/sampling_logp_difference/mean": 0.01202384103089571, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 176.140625, "completions/mean_terminated_length": 176.140625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.21644078195095062, "epoch": 0.42892156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.22621544381853376, "kl": 0.06590238213539124, "learning_rate": 9.946257548087619e-07, "loss": 0.0005, "num_tokens": 10854979.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7846208810806274, "sampling/importance_sampling_ratio/mean": 0.9999986290931702, "sampling/importance_sampling_ratio/min": 0.390634685754776, "sampling/sampling_logp_difference/max": 0.9399824738502502, "sampling/sampling_logp_difference/mean": 0.01423841156065464, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.17167896032333374, "epoch": 0.43014705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.05660215114493649, "kl": 0.024589000269770622, "learning_rate": 9.945210858839008e-07, "loss": 0.0002, "num_tokens": 10883751.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6356914043426514, "sampling/importance_sampling_ratio/mean": 1.0000401735305786, "sampling/importance_sampling_ratio/min": 0.5330818891525269, "sampling/sampling_logp_difference/max": 0.6290802955627441, "sampling/sampling_logp_difference/mean": 0.012451952323317528, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 280.234375, "completions/mean_terminated_length": 280.234375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.22605139017105103, "epoch": 0.43137254901960786, "frac_reward_zero_std": 0.75, "grad_norm": 1.170995796907789, "kl": 0.030084867030382156, "learning_rate": 9.944154131125642e-07, "loss": 0.0181, "num_tokens": 10919542.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995243549346924, "sampling/importance_sampling_ratio/min": 0.11731626838445663, "sampling/sampling_logp_difference/max": 2.1428818702697754, "sampling/sampling_logp_difference/mean": 0.014059376902878284, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 242.21875, "completions/mean_terminated_length": 242.21875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.23910082876682281, "epoch": 0.4325980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.5118189053638529, "kl": 0.03150876611471176, "learning_rate": 9.94308736709261e-07, "loss": 0.067, "num_tokens": 10955588.0, "reward": -0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001499652862549, "sampling/importance_sampling_ratio/min": 0.5876941084861755, "sampling/sampling_logp_difference/max": 0.9266014099121094, "sampling/sampling_logp_difference/mean": 0.015095872804522514, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 210.8125, "completions/mean_terminated_length": 210.8125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.22580724954605103, "epoch": 0.4338235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 2.348216584589421, "kl": 0.02664954587817192, "learning_rate": 9.94201056890538e-07, "loss": -0.0162, "num_tokens": 10986296.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996037483215332, "sampling/importance_sampling_ratio/min": 0.4866310954093933, "sampling/sampling_logp_difference/max": 0.7321920394897461, "sampling/sampling_logp_difference/mean": 0.015312884002923965, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 283.90625, "completions/mean_terminated_length": 283.90625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.28491437435150146, "epoch": 0.43504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.2733266904092038, "kl": 0.026450350880622864, "learning_rate": 9.940923738749777e-07, "loss": -0.0108, "num_tokens": 11021714.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.79961359500885, "sampling/importance_sampling_ratio/mean": 1.0002620220184326, "sampling/importance_sampling_ratio/min": 0.4311363995075226, "sampling/sampling_logp_difference/max": 0.8413307666778564, "sampling/sampling_logp_difference/mean": 0.016569074243307114, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 222.71875, "completions/mean_terminated_length": 222.71875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.25121283531188965, "epoch": 0.4362745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.04937892096437045, "kl": 0.026176000013947487, "learning_rate": 9.939826878832003e-07, "loss": 0.0003, "num_tokens": 11051440.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6256892681121826, "sampling/importance_sampling_ratio/mean": 0.9997677803039551, "sampling/importance_sampling_ratio/min": 0.3972400426864624, "sampling/sampling_logp_difference/max": 0.9232145547866821, "sampling/sampling_logp_difference/mean": 0.016339467838406563, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 240.640625, "completions/mean_terminated_length": 240.640625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.32118406891822815, "epoch": 0.4375, "frac_reward_zero_std": 0.25, "grad_norm": 2.1057094242743686, "kl": 0.03562553972005844, "learning_rate": 9.938719991378613e-07, "loss": -0.0342, "num_tokens": 11088825.0, "reward": 0.28125, "reward_std": 0.659286618232727, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5646177530288696, "sampling/importance_sampling_ratio/mean": 0.9996243119239807, "sampling/importance_sampling_ratio/min": 0.4814416170120239, "sampling/sampling_logp_difference/max": 0.7309703826904297, "sampling/sampling_logp_difference/mean": 0.01743035763502121, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 198.3125, "completions/mean_terminated_length": 198.3125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.28648340702056885, "epoch": 0.4387254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.5807774770939829, "kl": 0.04289580509066582, "learning_rate": 9.937603078636518e-07, "loss": 0.0005, "num_tokens": 11127069.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991505742073059, "sampling/importance_sampling_ratio/min": 0.22127984464168549, "sampling/sampling_logp_difference/max": 1.5083271265029907, "sampling/sampling_logp_difference/mean": 0.0200237687677145, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 163.546875, "completions/mean_terminated_length": 163.546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.22636833786964417, "epoch": 0.43995098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 2.298762577698692, "kl": 0.05313120782375336, "learning_rate": 9.936476142872977e-07, "loss": -0.0079, "num_tokens": 11151088.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.806626319885254, "sampling/importance_sampling_ratio/mean": 1.0003206729888916, "sampling/importance_sampling_ratio/min": 0.48932701349258423, "sampling/sampling_logp_difference/max": 0.7147243022918701, "sampling/sampling_logp_difference/mean": 0.015751542523503304, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 207.84375, "completions/mean_terminated_length": 207.84375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.26953721046447754, "epoch": 0.4411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.6749487824217606, "kl": 0.0324811227619648, "learning_rate": 9.935339186375603e-07, "loss": 0.0037, "num_tokens": 11185222.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6977006196975708, "sampling/importance_sampling_ratio/mean": 1.000344157218933, "sampling/importance_sampling_ratio/min": 0.4160033166408539, "sampling/sampling_logp_difference/max": 0.8770620226860046, "sampling/sampling_logp_difference/mean": 0.016557641327381134, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 296.03125, "completions/mean_terminated_length": 296.03125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2822401821613312, "epoch": 0.4424019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.5389362004608935, "kl": 0.025479158386588097, "learning_rate": 9.934192211452344e-07, "loss": -0.0208, "num_tokens": 11230536.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7178759574890137, "sampling/importance_sampling_ratio/mean": 0.9992885589599609, "sampling/importance_sampling_ratio/min": 0.35823339223861694, "sampling/sampling_logp_difference/max": 1.0265705585479736, "sampling/sampling_logp_difference/mean": 0.015677716583013535, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 282.828125, "completions/mean_terminated_length": 282.828125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.24845105409622192, "epoch": 0.44362745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.06160487922193275, "kl": 0.02625298500061035, "learning_rate": 9.933035220431487e-07, "loss": 0.0002, "num_tokens": 11266669.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5616941452026367, "sampling/importance_sampling_ratio/mean": 0.9996004104614258, "sampling/importance_sampling_ratio/min": 0.47252514958381653, "sampling/sampling_logp_difference/max": 0.749664306640625, "sampling/sampling_logp_difference/mean": 0.014867119491100311, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 213.8125, "completions/mean_terminated_length": 213.8125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.26533043384552, "epoch": 0.44485294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.9237355703924648, "kl": 0.04259977489709854, "learning_rate": 9.931868215661647e-07, "loss": -0.0115, "num_tokens": 11296673.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8399380445480347, "sampling/importance_sampling_ratio/mean": 0.9999451637268066, "sampling/importance_sampling_ratio/min": 0.4808300733566284, "sampling/sampling_logp_difference/max": 0.7322413921356201, "sampling/sampling_logp_difference/mean": 0.01594679430127144, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 209.96875, "completions/mean_terminated_length": 209.96875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.176650732755661, "epoch": 0.44607843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.04743493969795115, "kl": 0.02621782198548317, "learning_rate": 9.930691199511773e-07, "loss": 0.0003, "num_tokens": 11323151.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000335454940796, "sampling/importance_sampling_ratio/min": 0.5128016471862793, "sampling/sampling_logp_difference/max": 0.7265121936798096, "sampling/sampling_logp_difference/mean": 0.01218412071466446, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 256.109375, "completions/mean_terminated_length": 256.109375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.24810782074928284, "epoch": 0.44730392156862747, "frac_reward_zero_std": 0.5, "grad_norm": 1.7061983645548506, "kl": 0.04609496146440506, "learning_rate": 9.929504174371136e-07, "loss": -0.0122, "num_tokens": 11357430.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999706745147705, "sampling/importance_sampling_ratio/min": 0.45373624563217163, "sampling/sampling_logp_difference/max": 0.7902392148971558, "sampling/sampling_logp_difference/mean": 0.014929584227502346, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 223.984375, "completions/mean_terminated_length": 223.984375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2555466890335083, "epoch": 0.4485294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.3109607474275589, "kl": 0.03534568101167679, "learning_rate": 9.928307142649314e-07, "loss": 0.0079, "num_tokens": 11386405.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5941057205200195, "sampling/importance_sampling_ratio/mean": 1.000087022781372, "sampling/importance_sampling_ratio/min": 0.4885924458503723, "sampling/sampling_logp_difference/max": 0.7162265777587891, "sampling/sampling_logp_difference/mean": 0.01537355873733759, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 243.1875, "completions/mean_terminated_length": 243.1875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2532086968421936, "epoch": 0.4497549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.5523789955813703, "kl": 0.040189456194639206, "learning_rate": 9.927100106776212e-07, "loss": -0.0406, "num_tokens": 11416769.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001037120819092, "sampling/importance_sampling_ratio/min": 0.31536558270454407, "sampling/sampling_logp_difference/max": 1.1540226936340332, "sampling/sampling_logp_difference/mean": 0.01462498214095831, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 227.84375, "completions/mean_terminated_length": 227.84375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2768535912036896, "epoch": 0.45098039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 1.8081566949498855, "kl": 0.03865700215101242, "learning_rate": 9.925883069202034e-07, "loss": -0.1665, "num_tokens": 11449927.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.972188115119934, "sampling/importance_sampling_ratio/mean": 0.9997962117195129, "sampling/importance_sampling_ratio/min": 0.4022789001464844, "sampling/sampling_logp_difference/max": 0.9106096029281616, "sampling/sampling_logp_difference/mean": 0.01726974919438362, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 319.421875, "completions/mean_terminated_length": 319.421875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.28473591804504395, "epoch": 0.4522058823529412, "frac_reward_zero_std": 0.75, "grad_norm": 0.8438301060346188, "kl": 0.0417214035987854, "learning_rate": 9.92465603239729e-07, "loss": -0.0117, "num_tokens": 11488962.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0001962184906006, "sampling/importance_sampling_ratio/min": 0.1541740596294403, "sampling/sampling_logp_difference/max": 1.8696730136871338, "sampling/sampling_logp_difference/mean": 0.015511645935475826, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2947255074977875, "epoch": 0.4534313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.2613504387461651, "kl": 0.040817294269800186, "learning_rate": 9.923418998852787e-07, "loss": 0.0047, "num_tokens": 11519922.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000306367874146, "sampling/importance_sampling_ratio/min": 0.35686278343200684, "sampling/sampling_logp_difference/max": 1.030403971672058, "sampling/sampling_logp_difference/mean": 0.016571637243032455, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 209.4375, "completions/mean_terminated_length": 209.4375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2516079545021057, "epoch": 0.45465686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.061096885384244876, "kl": 0.03814944624900818, "learning_rate": 9.922171971079622e-07, "loss": 0.0004, "num_tokens": 11549982.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.995097041130066, "sampling/importance_sampling_ratio/mean": 1.0001697540283203, "sampling/importance_sampling_ratio/min": 0.5281258821487427, "sampling/sampling_logp_difference/max": 0.690692663192749, "sampling/sampling_logp_difference/mean": 0.014308227226138115, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 227.4375, "completions/mean_terminated_length": 227.4375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.28589046001434326, "epoch": 0.45588235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.6545090252137973, "kl": 0.062388550490140915, "learning_rate": 9.920914951609186e-07, "loss": 0.077, "num_tokens": 11582650.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5279115438461304, "sampling/importance_sampling_ratio/mean": 0.999853789806366, "sampling/importance_sampling_ratio/min": 0.5709834694862366, "sampling/sampling_logp_difference/max": 0.5603950023651123, "sampling/sampling_logp_difference/mean": 0.013815401121973991, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 211.4375, "completions/mean_terminated_length": 211.4375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2030314952135086, "epoch": 0.4571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.05725399699741107, "kl": 0.03566918522119522, "learning_rate": 9.919647942993147e-07, "loss": 0.0003, "num_tokens": 11614246.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6054738759994507, "sampling/importance_sampling_ratio/mean": 0.9997533559799194, "sampling/importance_sampling_ratio/min": 0.5426429510116577, "sampling/sampling_logp_difference/max": 0.6113038063049316, "sampling/sampling_logp_difference/mean": 0.013296281918883324, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 268.515625, "completions/mean_terminated_length": 268.515625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2816750109195709, "epoch": 0.4583333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 1.3493486641771302, "kl": 0.05643295496702194, "learning_rate": 9.918370947803455e-07, "loss": -0.0437, "num_tokens": 11654951.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4764822721481323, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.6057345867156982, "sampling/sampling_logp_difference/max": 0.5013134479522705, "sampling/sampling_logp_difference/mean": 0.0143747478723526, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 240.703125, "completions/mean_terminated_length": 240.703125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2940741181373596, "epoch": 0.45955882352941174, "frac_reward_zero_std": 0.5, "grad_norm": 1.6227521491911283, "kl": 0.06486812233924866, "learning_rate": 9.917083968632326e-07, "loss": 0.008, "num_tokens": 11685780.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001022815704346, "sampling/importance_sampling_ratio/min": 0.5026050209999084, "sampling/sampling_logp_difference/max": 0.732917308807373, "sampling/sampling_logp_difference/mean": 0.016473714262247086, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 225.53125, "completions/mean_terminated_length": 225.53125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.39877572655677795, "epoch": 0.46078431372549017, "frac_reward_zero_std": 0.25, "grad_norm": 2.2824686087004187, "kl": 0.08406936377286911, "learning_rate": 9.915787008092246e-07, "loss": 0.0294, "num_tokens": 11722630.0, "reward": 0.125, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9694862365722656, "sampling/importance_sampling_ratio/mean": 1.000741958618164, "sampling/importance_sampling_ratio/min": 0.5329796075820923, "sampling/sampling_logp_difference/max": 0.6777727603912354, "sampling/sampling_logp_difference/mean": 0.01972278580069542, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 184.296875, "completions/mean_terminated_length": 184.296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.25606122612953186, "epoch": 0.46200980392156865, "frac_reward_zero_std": 1.0, "grad_norm": 0.06060053819807976, "kl": 0.04585852473974228, "learning_rate": 9.914480068815961e-07, "loss": 0.0005, "num_tokens": 11754329.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6166514158248901, "sampling/importance_sampling_ratio/mean": 1.0003126859664917, "sampling/importance_sampling_ratio/min": 0.17059476673603058, "sampling/sampling_logp_difference/max": 1.7684643268585205, "sampling/sampling_logp_difference/mean": 0.016700156033039093, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 219.765625, "completions/mean_terminated_length": 219.765625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.22416208684444427, "epoch": 0.4632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.05300771080766106, "kl": 0.03702693060040474, "learning_rate": 9.913163153456482e-07, "loss": 0.0004, "num_tokens": 11783162.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6099443435668945, "sampling/importance_sampling_ratio/mean": 0.999893844127655, "sampling/importance_sampling_ratio/min": 0.46763360500335693, "sampling/sampling_logp_difference/max": 0.7600702047348022, "sampling/sampling_logp_difference/mean": 0.01346520520746708, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 296.453125, "completions/mean_terminated_length": 296.453125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.19604724645614624, "epoch": 0.4644607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.04359660214057077, "kl": 0.03180554509162903, "learning_rate": 9.91183626468706e-07, "loss": 0.0003, "num_tokens": 11821095.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071579217910767, "sampling/importance_sampling_ratio/mean": 0.9998345375061035, "sampling/importance_sampling_ratio/min": 0.573077380657196, "sampling/sampling_logp_difference/max": 0.556734561920166, "sampling/sampling_logp_difference/mean": 0.011631621047854424, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 230.515625, "completions/mean_terminated_length": 230.515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.33643507957458496, "epoch": 0.46568627450980393, "frac_reward_zero_std": 0.75, "grad_norm": 0.8763197746933978, "kl": 0.05861668288707733, "learning_rate": 9.910499405201193e-07, "loss": 0.0095, "num_tokens": 11854712.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.613815426826477, "sampling/importance_sampling_ratio/mean": 0.9995595216751099, "sampling/importance_sampling_ratio/min": 0.1923624575138092, "sampling/sampling_logp_difference/max": 1.6483738422393799, "sampling/sampling_logp_difference/mean": 0.01788877323269844, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 173.984375, "completions/mean_terminated_length": 173.984375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2999635636806488, "epoch": 0.46691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.07982167416818635, "kl": 0.06704611331224442, "learning_rate": 9.909152577712625e-07, "loss": 0.0006, "num_tokens": 11880535.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5210472345352173, "sampling/importance_sampling_ratio/mean": 1.0001461505889893, "sampling/importance_sampling_ratio/min": 0.48961183428764343, "sampling/sampling_logp_difference/max": 0.7141423225402832, "sampling/sampling_logp_difference/mean": 0.018397457897663116, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 190.578125, "completions/mean_terminated_length": 190.578125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.19606657326221466, "epoch": 0.4681372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.06131105622796064, "kl": 0.045415766537189484, "learning_rate": 9.907795784955326e-07, "loss": 0.0005, "num_tokens": 11909372.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.742784023284912, "sampling/importance_sampling_ratio/mean": 0.9999132752418518, "sampling/importance_sampling_ratio/min": 0.32741686701774597, "sampling/sampling_logp_difference/max": 1.1165211200714111, "sampling/sampling_logp_difference/mean": 0.013729160651564598, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 304.453125, "completions/mean_terminated_length": 304.453125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.38282719254493713, "epoch": 0.4693627450980392, "frac_reward_zero_std": 0.25, "grad_norm": 1.8662681575337112, "kl": 0.0544503778219223, "learning_rate": 9.906429029683504e-07, "loss": -0.026, "num_tokens": 11946009.0, "reward": 0.0625, "reward_std": 0.6645200252532959, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001490116119385, "sampling/importance_sampling_ratio/min": 0.5103266835212708, "sampling/sampling_logp_difference/max": 0.7706844806671143, "sampling/sampling_logp_difference/mean": 0.016581425443291664, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 232.640625, "completions/mean_terminated_length": 232.640625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.27952274680137634, "epoch": 0.47058823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 1.5734908930850455, "kl": 0.050721801817417145, "learning_rate": 9.90505231467158e-07, "loss": -0.039, "num_tokens": 11983330.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998074769973755, "sampling/importance_sampling_ratio/min": 0.2890108525753021, "sampling/sampling_logp_difference/max": 1.8018252849578857, "sampling/sampling_logp_difference/mean": 0.018836161121726036, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 243.9375, "completions/mean_terminated_length": 243.9375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2589993476867676, "epoch": 0.47181372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 1.4847781689335693, "kl": 0.04911482334136963, "learning_rate": 9.903665642714204e-07, "loss": -0.0347, "num_tokens": 12016574.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6172823905944824, "sampling/importance_sampling_ratio/mean": 1.0000231266021729, "sampling/importance_sampling_ratio/min": 0.5771543979644775, "sampling/sampling_logp_difference/max": 0.5496454238891602, "sampling/sampling_logp_difference/mean": 0.01421155035495758, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 284.515625, "completions/mean_terminated_length": 284.515625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2348122000694275, "epoch": 0.4730392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.9464947450891469, "kl": 0.028523039072752, "learning_rate": 9.90226901662623e-07, "loss": -0.0021, "num_tokens": 12050415.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997332692146301, "sampling/importance_sampling_ratio/min": 0.4180172085762024, "sampling/sampling_logp_difference/max": 0.8722326755523682, "sampling/sampling_logp_difference/mean": 0.014374499209225178, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 198.8125, "completions/mean_terminated_length": 198.8125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2792658507823944, "epoch": 0.4742647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 1.2948043764887476, "kl": 0.044489506632089615, "learning_rate": 9.900862439242718e-07, "loss": 0.0248, "num_tokens": 12080835.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.934046745300293, "sampling/importance_sampling_ratio/mean": 0.9999051690101624, "sampling/importance_sampling_ratio/min": 0.4885071814060211, "sampling/sampling_logp_difference/max": 0.7164011001586914, "sampling/sampling_logp_difference/mean": 0.017194809392094612, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 179.6875, "completions/mean_terminated_length": 179.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.32544904947280884, "epoch": 0.47549019607843135, "frac_reward_zero_std": 0.75, "grad_norm": 1.455400063814383, "kl": 0.06876906752586365, "learning_rate": 9.899445913418935e-07, "loss": -0.0318, "num_tokens": 12113871.0, "reward": -0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5903608798980713, "sampling/importance_sampling_ratio/mean": 1.0007281303405762, "sampling/importance_sampling_ratio/min": 0.5240790843963623, "sampling/sampling_logp_difference/max": 0.6461126804351807, "sampling/sampling_logp_difference/mean": 0.019794166088104248, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 191.46875, "completions/mean_terminated_length": 191.46875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2183282971382141, "epoch": 0.47671568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.05303185126965551, "kl": 0.03167909383773804, "learning_rate": 9.898019442030337e-07, "loss": 0.0003, "num_tokens": 12140765.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004873275756836, "sampling/importance_sampling_ratio/min": 0.6091670989990234, "sampling/sampling_logp_difference/max": 0.837608814239502, "sampling/sampling_logp_difference/mean": 0.013596564531326294, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3464588522911072, "epoch": 0.47794117647058826, "frac_reward_zero_std": 0.75, "grad_norm": 1.4594377353718966, "kl": 0.06954923272132874, "learning_rate": 9.89658302797257e-07, "loss": 0.0105, "num_tokens": 12166821.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.9741272926330566, "sampling/importance_sampling_ratio/mean": 1.0001782178878784, "sampling/importance_sampling_ratio/min": 0.48248496651649475, "sampling/sampling_logp_difference/max": 0.7288055419921875, "sampling/sampling_logp_difference/mean": 0.019971702247858047, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 227.65625, "completions/mean_terminated_length": 227.65625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.23684090375900269, "epoch": 0.4791666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.05047677743379039, "kl": 0.028256338089704514, "learning_rate": 9.895136674161464e-07, "loss": 0.0003, "num_tokens": 12198687.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00074303150177, "sampling/importance_sampling_ratio/min": 0.2696039378643036, "sampling/sampling_logp_difference/max": 1.3108012676239014, "sampling/sampling_logp_difference/mean": 0.01520511694252491, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 279.90625, "completions/mean_terminated_length": 279.90625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.30703943967819214, "epoch": 0.4803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9915338354451154, "kl": 0.044222671538591385, "learning_rate": 9.893680383533024e-07, "loss": 0.0671, "num_tokens": 12236281.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6116913557052612, "sampling/importance_sampling_ratio/mean": 0.9999194145202637, "sampling/importance_sampling_ratio/min": 0.43365907669067383, "sampling/sampling_logp_difference/max": 0.8354966640472412, "sampling/sampling_logp_difference/mean": 0.016938215121626854, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 274.8125, "completions/mean_terminated_length": 274.8125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3057442009449005, "epoch": 0.48161764705882354, "frac_reward_zero_std": 0.25, "grad_norm": 1.7707064477493701, "kl": 0.07207056879997253, "learning_rate": 9.892214159043433e-07, "loss": -0.0496, "num_tokens": 12273821.0, "reward": 0.75, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7151470184326172, "sampling/importance_sampling_ratio/mean": 0.9999011754989624, "sampling/importance_sampling_ratio/min": 0.4128479063510895, "sampling/sampling_logp_difference/max": 0.8846759796142578, "sampling/sampling_logp_difference/mean": 0.01808691769838333, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 233.640625, "completions/mean_terminated_length": 233.640625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2136198878288269, "epoch": 0.48284313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.328368413009168, "kl": 0.03892651945352554, "learning_rate": 9.890738003669027e-07, "loss": 0.0535, "num_tokens": 12304358.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000007152557373, "sampling/importance_sampling_ratio/min": 0.4823680520057678, "sampling/sampling_logp_difference/max": 0.9336278438568115, "sampling/sampling_logp_difference/mean": 0.013449429534375668, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 245.71875, "completions/mean_terminated_length": 245.71875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3100472092628479, "epoch": 0.4840686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.6724824585110218, "kl": 0.04158332943916321, "learning_rate": 9.889251920406312e-07, "loss": -0.0308, "num_tokens": 12334996.0, "reward": -0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5585296154022217, "sampling/importance_sampling_ratio/mean": 1.000278115272522, "sampling/importance_sampling_ratio/min": 0.26434141397476196, "sampling/sampling_logp_difference/max": 1.3305137157440186, "sampling/sampling_logp_difference/mean": 0.01594216749072075, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 382.78125, "completions/mean_terminated_length": 382.78125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.18888866901397705, "epoch": 0.4852941176470588, "frac_reward_zero_std": 0.75, "grad_norm": 0.7714379917556299, "kl": 0.02613941766321659, "learning_rate": 9.887755912271942e-07, "loss": 0.0191, "num_tokens": 12376486.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996835589408875, "sampling/importance_sampling_ratio/min": 0.32474690675735474, "sampling/sampling_logp_difference/max": 1.124709129333496, "sampling/sampling_logp_difference/mean": 0.010552226565778255, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 251.859375, "completions/mean_terminated_length": 251.859375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2508004605770111, "epoch": 0.48651960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 1.227985971228998, "kl": 0.06145814433693886, "learning_rate": 9.886249982302718e-07, "loss": -0.0478, "num_tokens": 12410653.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000058889389038, "sampling/importance_sampling_ratio/min": 0.2936866283416748, "sampling/sampling_logp_difference/max": 1.2252418994903564, "sampling/sampling_logp_difference/mean": 0.016481686383485794, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 313.921875, "completions/mean_terminated_length": 313.921875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28108274936676025, "epoch": 0.4877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.07062007147852627, "kl": 0.05387008190155029, "learning_rate": 9.884734133555585e-07, "loss": 0.0004, "num_tokens": 12447752.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000408411026001, "sampling/importance_sampling_ratio/min": 0.5632816553115845, "sampling/sampling_logp_difference/max": 0.9375678300857544, "sampling/sampling_logp_difference/mean": 0.01568533293902874, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 210.546875, "completions/mean_terminated_length": 210.546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.21270164847373962, "epoch": 0.4889705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0750156076656718, "kl": 0.03275253623723984, "learning_rate": 9.883208369107617e-07, "loss": 0.0003, "num_tokens": 12477115.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.809498906135559, "sampling/importance_sampling_ratio/mean": 0.9995313882827759, "sampling/importance_sampling_ratio/min": 0.21648813784122467, "sampling/sampling_logp_difference/max": 1.530219554901123, "sampling/sampling_logp_difference/mean": 0.014600234106183052, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 207.140625, "completions/mean_terminated_length": 207.140625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.20452356338500977, "epoch": 0.49019607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.06209651334677291, "kl": 0.0376552939414978, "learning_rate": 9.88167269205602e-07, "loss": 0.0004, "num_tokens": 12503556.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8523972034454346, "sampling/importance_sampling_ratio/mean": 1.0001587867736816, "sampling/importance_sampling_ratio/min": 0.6124428510665894, "sampling/sampling_logp_difference/max": 0.6164805889129639, "sampling/sampling_logp_difference/mean": 0.012564984150230885, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 273.40625, "completions/mean_terminated_length": 273.40625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.20392152667045593, "epoch": 0.49142156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.05089353030210747, "kl": 0.02923227660357952, "learning_rate": 9.880127105518122e-07, "loss": 0.0003, "num_tokens": 12538766.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 1.0003801584243774, "sampling/importance_sampling_ratio/min": 0.37230610847473145, "sampling/sampling_logp_difference/max": 0.9880388975143433, "sampling/sampling_logp_difference/mean": 0.012940148822963238, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 166.734375, "completions/mean_terminated_length": 166.734375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.29518431425094604, "epoch": 0.49264705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 2.529219640519847, "kl": 0.05905640870332718, "learning_rate": 9.878571612631363e-07, "loss": 0.0531, "num_tokens": 12564701.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6979831457138062, "sampling/importance_sampling_ratio/mean": 0.9998425245285034, "sampling/importance_sampling_ratio/min": 0.5172002911567688, "sampling/sampling_logp_difference/max": 0.6593250036239624, "sampling/sampling_logp_difference/mean": 0.018709469586610794, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 193.671875, "completions/mean_terminated_length": 193.671875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2510262131690979, "epoch": 0.49387254901960786, "frac_reward_zero_std": 1.0, "grad_norm": 0.33292477718977276, "kl": 0.04922737926244736, "learning_rate": 9.8770062165533e-07, "loss": 0.0005, "num_tokens": 12599480.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5513496398925781, "sampling/importance_sampling_ratio/mean": 0.999625563621521, "sampling/importance_sampling_ratio/min": 0.30741992592811584, "sampling/sampling_logp_difference/max": 1.1795406341552734, "sampling/sampling_logp_difference/mean": 0.017182782292366028, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 145.90625, "completions/mean_terminated_length": 145.90625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.19823022186756134, "epoch": 0.4950980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.826081871373586, "kl": 0.0499572679400444, "learning_rate": 9.875430920461583e-07, "loss": 0.0142, "num_tokens": 12626114.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.693151593208313, "sampling/importance_sampling_ratio/mean": 0.9995004534721375, "sampling/importance_sampling_ratio/min": 0.23994095623493195, "sampling/sampling_logp_difference/max": 1.4273624420166016, "sampling/sampling_logp_difference/mean": 0.01597466692328453, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 181.0625, "completions/mean_terminated_length": 181.0625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.19514986872673035, "epoch": 0.4963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0834316994079156, "kl": 0.042915742844343185, "learning_rate": 9.873845727553965e-07, "loss": 0.0004, "num_tokens": 12660022.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8722196817398071, "sampling/importance_sampling_ratio/mean": 0.9999871850013733, "sampling/importance_sampling_ratio/min": 0.6210613250732422, "sampling/sampling_logp_difference/max": 0.6271247863769531, "sampling/sampling_logp_difference/mean": 0.013638028874993324, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 173.671875, "completions/mean_terminated_length": 173.671875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.23224231600761414, "epoch": 0.49754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.20167192962952737, "kl": 0.04924897104501724, "learning_rate": 9.87225064104829e-07, "loss": 0.0005, "num_tokens": 12686065.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.815342903137207, "sampling/importance_sampling_ratio/mean": 0.99947190284729, "sampling/importance_sampling_ratio/min": 0.1088278517127037, "sampling/sampling_logp_difference/max": 2.2179880142211914, "sampling/sampling_logp_difference/mean": 0.016529075801372528, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 267.5, "completions/mean_terminated_length": 267.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.16993097960948944, "epoch": 0.4987745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.035463469020333434, "kl": 0.023991378024220467, "learning_rate": 9.870645664182476e-07, "loss": 0.0002, "num_tokens": 12719585.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999097585678101, "sampling/importance_sampling_ratio/min": 0.44757285714149475, "sampling/sampling_logp_difference/max": 1.5597548484802246, "sampling/sampling_logp_difference/mean": 0.011877389624714851, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 173.078125, "completions/mean_terminated_length": 173.078125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2073623687028885, "epoch": 0.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.06405288974078595, "kl": 0.0290830098092556, "learning_rate": 9.86903080021453e-07, "loss": 0.0003, "num_tokens": 12749046.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001136064529419, "sampling/importance_sampling_ratio/min": 0.42510080337524414, "sampling/sampling_logp_difference/max": 1.110346794128418, "sampling/sampling_logp_difference/mean": 0.01655196025967598, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 269.015625, "completions/mean_terminated_length": 269.015625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2286536544561386, "epoch": 0.5012254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105192718276332, "kl": 0.023631129413843155, "learning_rate": 9.867406052422523e-07, "loss": 0.0002, "num_tokens": 12788423.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997172355651855, "sampling/importance_sampling_ratio/min": 0.4074231684207916, "sampling/sampling_logp_difference/max": 0.9378218650817871, "sampling/sampling_logp_difference/mean": 0.015676937997341156, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 242.40625, "completions/mean_terminated_length": 242.40625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2490488588809967, "epoch": 0.5024509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.140636246592225, "kl": 0.04674581438302994, "learning_rate": 9.865771424104587e-07, "loss": 0.0026, "num_tokens": 12821025.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.804461121559143, "sampling/importance_sampling_ratio/mean": 1.000612497329712, "sampling/importance_sampling_ratio/min": 0.5682246088981628, "sampling/sampling_logp_difference/max": 0.5902619361877441, "sampling/sampling_logp_difference/mean": 0.014655455946922302, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 328.0625, "completions/mean_terminated_length": 328.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2791997492313385, "epoch": 0.5036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.7274215866255332, "kl": 0.03656107187271118, "learning_rate": 9.864126918578919e-07, "loss": -0.0169, "num_tokens": 12860101.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.9620814323425293, "sampling/importance_sampling_ratio/mean": 0.9998489618301392, "sampling/importance_sampling_ratio/min": 0.27422481775283813, "sampling/sampling_logp_difference/max": 1.293807029724121, "sampling/sampling_logp_difference/mean": 0.017559055238962173, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 179.3125, "completions/mean_terminated_length": 179.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.1782495230436325, "epoch": 0.5049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.10630724203547498, "kl": 0.040607184171676636, "learning_rate": 9.862472539183755e-07, "loss": 0.0004, "num_tokens": 12887065.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9096622467041016, "sampling/importance_sampling_ratio/mean": 0.9999580979347229, "sampling/importance_sampling_ratio/min": 0.138926163315773, "sampling/sampling_logp_difference/max": 1.9738126993179321, "sampling/sampling_logp_difference/mean": 0.01566651463508606, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15380318462848663, "epoch": 0.5061274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.06374476952125667, "kl": 0.02124452218413353, "learning_rate": 9.860808289277385e-07, "loss": 0.0002, "num_tokens": 12915121.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002858638763428, "sampling/importance_sampling_ratio/min": 0.4787582457065582, "sampling/sampling_logp_difference/max": 0.8502476215362549, "sampling/sampling_logp_difference/mean": 0.012755339965224266, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 196.6875, "completions/mean_terminated_length": 196.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.18463312089443207, "epoch": 0.5073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.45936447945665526, "kl": 0.041250187903642654, "learning_rate": 9.859134172238128e-07, "loss": 0.0004, "num_tokens": 12942349.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.62086820602417, "sampling/importance_sampling_ratio/mean": 0.9995936155319214, "sampling/importance_sampling_ratio/min": 0.524817705154419, "sampling/sampling_logp_difference/max": 0.6447043418884277, "sampling/sampling_logp_difference/mean": 0.01295830775052309, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 186.421875, "completions/mean_terminated_length": 186.421875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1308806836605072, "epoch": 0.508578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.12053416482904825, "kl": 0.021573197096586227, "learning_rate": 9.857450191464337e-07, "loss": 0.0002, "num_tokens": 12970232.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999610424041748, "sampling/importance_sampling_ratio/min": 0.20379473268985748, "sampling/sampling_logp_difference/max": 1.590641975402832, "sampling/sampling_logp_difference/mean": 0.011412444524466991, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 177.6875, "completions/mean_terminated_length": 177.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.25102418661117554, "epoch": 0.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.05888729256866505, "kl": 0.04825008660554886, "learning_rate": 9.855756350374386e-07, "loss": 0.0005, "num_tokens": 13010228.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.742524266242981, "sampling/importance_sampling_ratio/mean": 0.9999663829803467, "sampling/importance_sampling_ratio/min": 0.374472051858902, "sampling/sampling_logp_difference/max": 0.9822380542755127, "sampling/sampling_logp_difference/mean": 0.018441712483763695, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.27332425117492676, "epoch": 0.5110294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.6965885955451983, "kl": 0.06332144141197205, "learning_rate": 9.854052652406665e-07, "loss": 0.0244, "num_tokens": 13044620.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.958435297012329, "sampling/importance_sampling_ratio/mean": 0.9991956949234009, "sampling/importance_sampling_ratio/min": 0.2559586465358734, "sampling/sampling_logp_difference/max": 1.3627393245697021, "sampling/sampling_logp_difference/mean": 0.01965285651385784, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 201.765625, "completions/mean_terminated_length": 201.765625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.20585131645202637, "epoch": 0.5122549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.4773288838893448, "kl": 0.028390567749738693, "learning_rate": 9.852339101019572e-07, "loss": -0.0381, "num_tokens": 13074701.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7199848890304565, "sampling/importance_sampling_ratio/mean": 0.9992534518241882, "sampling/importance_sampling_ratio/min": 0.3371962904930115, "sampling/sampling_logp_difference/max": 1.087090015411377, "sampling/sampling_logp_difference/mean": 0.013995368033647537, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 182.046875, "completions/mean_terminated_length": 182.046875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.19686485826969147, "epoch": 0.5134803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.04236162971608435, "kl": 0.036349326372146606, "learning_rate": 9.85061569969151e-07, "loss": 0.0003, "num_tokens": 13107600.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6127358675003052, "sampling/importance_sampling_ratio/mean": 1.0003676414489746, "sampling/importance_sampling_ratio/min": 0.4934726655483246, "sampling/sampling_logp_difference/max": 0.7062878608703613, "sampling/sampling_logp_difference/mean": 0.015297004953026772, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 187.765625, "completions/mean_terminated_length": 187.765625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.172816663980484, "epoch": 0.5147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.04712307185938354, "kl": 0.026017405092716217, "learning_rate": 9.848882451920875e-07, "loss": 0.0002, "num_tokens": 13137985.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8718520402908325, "sampling/importance_sampling_ratio/mean": 0.9995954036712646, "sampling/importance_sampling_ratio/min": 0.02654479630291462, "sampling/sampling_logp_difference/max": 3.6289215087890625, "sampling/sampling_logp_difference/mean": 0.013399068266153336, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 181.640625, "completions/mean_terminated_length": 181.640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.1799655705690384, "epoch": 0.5159313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.1032649243091597, "kl": 0.03207855671644211, "learning_rate": 9.847139361226046e-07, "loss": -0.0172, "num_tokens": 13164330.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.611358642578125, "sampling/importance_sampling_ratio/mean": 1.000049114227295, "sampling/importance_sampling_ratio/min": 0.35595452785491943, "sampling/sampling_logp_difference/max": 1.0329523086547852, "sampling/sampling_logp_difference/mean": 0.01461087167263031, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 172.4375, "completions/mean_terminated_length": 172.4375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.20940491557121277, "epoch": 0.5171568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.07394043380277103, "kl": 0.03250215947628021, "learning_rate": 9.84538643114539e-07, "loss": 0.0003, "num_tokens": 13187846.0, "reward": -1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": -1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996784925460815, "sampling/importance_sampling_ratio/min": 0.5260618329048157, "sampling/sampling_logp_difference/max": 0.8738067150115967, "sampling/sampling_logp_difference/mean": 0.01687995158135891, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.1665087342262268, "epoch": 0.5183823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.08174180673496445, "kl": 0.037060923874378204, "learning_rate": 9.843623665237242e-07, "loss": 0.0004, "num_tokens": 13216654.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008127689361572, "sampling/importance_sampling_ratio/min": 0.4871967136859894, "sampling/sampling_logp_difference/max": 0.953502893447876, "sampling/sampling_logp_difference/mean": 0.014931373298168182, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 193.03125, "completions/mean_terminated_length": 193.03125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.18435706198215485, "epoch": 0.5196078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328042297588035, "kl": 0.02889874018728733, "learning_rate": 9.841851067079908e-07, "loss": 0.0003, "num_tokens": 13245504.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.91254460811615, "sampling/importance_sampling_ratio/mean": 0.9999415874481201, "sampling/importance_sampling_ratio/min": 0.2732740640640259, "sampling/sampling_logp_difference/max": 1.2972800731658936, "sampling/sampling_logp_difference/mean": 0.014272722415626049, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 166.640625, "completions/mean_terminated_length": 166.640625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2041907012462616, "epoch": 0.5208333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0727736574864572, "kl": 0.04394741728901863, "learning_rate": 9.840068640271647e-07, "loss": 0.0005, "num_tokens": 13271129.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000685453414917, "sampling/importance_sampling_ratio/min": 0.4440104067325592, "sampling/sampling_logp_difference/max": 0.8119072914123535, "sampling/sampling_logp_difference/mean": 0.015166521072387695, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 147.4375, "completions/mean_terminated_length": 147.4375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.148129403591156, "epoch": 0.5220588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.05450949523051405, "kl": 0.032044753432273865, "learning_rate": 9.838276388430675e-07, "loss": 0.0003, "num_tokens": 13295349.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6977241039276123, "sampling/importance_sampling_ratio/mean": 1.000300645828247, "sampling/importance_sampling_ratio/min": 0.6411559581756592, "sampling/sampling_logp_difference/max": 0.529288649559021, "sampling/sampling_logp_difference/mean": 0.012159367091953754, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 235.65625, "completions/mean_terminated_length": 235.65625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.23714864253997803, "epoch": 0.5232843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 2.2307582275952513, "kl": 0.05249298736453056, "learning_rate": 9.836474315195147e-07, "loss": 0.0382, "num_tokens": 13332767.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8917368650436401, "sampling/importance_sampling_ratio/mean": 1.0008130073547363, "sampling/importance_sampling_ratio/min": 0.44529467821121216, "sampling/sampling_logp_difference/max": 0.8090190887451172, "sampling/sampling_logp_difference/mean": 0.018667148426175117, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 133.59375, "completions/mean_terminated_length": 133.59375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.16025342047214508, "epoch": 0.5245098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.06014264560023584, "kl": 0.03190026804804802, "learning_rate": 9.83466242422316e-07, "loss": 0.0003, "num_tokens": 13356021.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001938343048096, "sampling/importance_sampling_ratio/min": 0.4700278043746948, "sampling/sampling_logp_difference/max": 0.7751011848449707, "sampling/sampling_logp_difference/mean": 0.012300319038331509, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 210.09375, "completions/mean_terminated_length": 210.09375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22527235746383667, "epoch": 0.5257352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.9659498041966834, "kl": 0.05424007773399353, "learning_rate": 9.832840719192735e-07, "loss": -0.0098, "num_tokens": 13388075.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001205205917358, "sampling/importance_sampling_ratio/min": 0.2378673404455185, "sampling/sampling_logp_difference/max": 1.4360421895980835, "sampling/sampling_logp_difference/mean": 0.017529528588056564, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 250.9375, "completions/mean_terminated_length": 250.9375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21268804371356964, "epoch": 0.5269607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.040568360055927316, "kl": 0.03319639712572098, "learning_rate": 9.831009203801822e-07, "loss": 0.0002, "num_tokens": 13424455.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6312425136566162, "sampling/importance_sampling_ratio/mean": 0.9993811249732971, "sampling/importance_sampling_ratio/min": 0.5135435461997986, "sampling/sampling_logp_difference/max": 0.6664204597473145, "sampling/sampling_logp_difference/mean": 0.01416376419365406, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 238.734375, "completions/mean_terminated_length": 238.734375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.16849209368228912, "epoch": 0.5281862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0314614954354919, "kl": 0.024410758167505264, "learning_rate": 9.829167881768277e-07, "loss": 0.0002, "num_tokens": 13460134.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001863241195679, "sampling/importance_sampling_ratio/min": 0.3524653911590576, "sampling/sampling_logp_difference/max": 1.0428028106689453, "sampling/sampling_logp_difference/mean": 0.013139157555997372, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 257.328125, "completions/mean_terminated_length": 257.328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.29006510972976685, "epoch": 0.5294117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 1.082350928696217, "kl": 0.024110358208417892, "learning_rate": 9.82731675682987e-07, "loss": 0.017, "num_tokens": 13496539.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9817054271697998, "sampling/importance_sampling_ratio/mean": 1.0004513263702393, "sampling/importance_sampling_ratio/min": 0.27278974652290344, "sampling/sampling_logp_difference/max": 1.2990540266036987, "sampling/sampling_logp_difference/mean": 0.017118092626333237, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 191.890625, "completions/mean_terminated_length": 191.890625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21561165153980255, "epoch": 0.5306372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 2.0240772764038137, "kl": 0.05167914927005768, "learning_rate": 9.825455832744266e-07, "loss": 0.0336, "num_tokens": 13529588.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001323223114014, "sampling/importance_sampling_ratio/min": 0.5724638104438782, "sampling/sampling_logp_difference/max": 0.724461555480957, "sampling/sampling_logp_difference/mean": 0.014281702227890491, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 284.90625, "completions/mean_terminated_length": 284.90625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.24130161106586456, "epoch": 0.5318627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.04439451275842723, "kl": 0.021637115627527237, "learning_rate": 9.823585113289023e-07, "loss": 0.0002, "num_tokens": 13576526.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000152349472046, "sampling/importance_sampling_ratio/min": 0.2516302466392517, "sampling/sampling_logp_difference/max": 1.3797945976257324, "sampling/sampling_logp_difference/mean": 0.016831308603286743, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2319016456604004, "epoch": 0.5330882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 1.4675323426382914, "kl": 0.04950553923845291, "learning_rate": 9.821704602261585e-07, "loss": 0.0118, "num_tokens": 13612814.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997687935829163, "sampling/importance_sampling_ratio/min": 0.5444316267967224, "sampling/sampling_logp_difference/max": 0.9254751205444336, "sampling/sampling_logp_difference/mean": 0.015987036749720573, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.18741881847381592, "epoch": 0.5343137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.602791349310115, "kl": 0.03300906717777252, "learning_rate": 9.819814303479267e-07, "loss": 0.0146, "num_tokens": 13640854.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9876214265823364, "sampling/importance_sampling_ratio/mean": 1.000852346420288, "sampling/importance_sampling_ratio/min": 0.30076903104782104, "sampling/sampling_logp_difference/max": 1.2014126777648926, "sampling/sampling_logp_difference/mean": 0.014924418181180954, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 165.578125, "completions/mean_terminated_length": 165.578125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.21863728761672974, "epoch": 0.5355392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.05073817343117938, "kl": 0.04330986738204956, "learning_rate": 9.817914220779256e-07, "loss": 0.0004, "num_tokens": 13669483.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000133514404297, "sampling/importance_sampling_ratio/min": 0.4652920067310333, "sampling/sampling_logp_difference/max": 0.8106462955474854, "sampling/sampling_logp_difference/mean": 0.01707840897142887, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1936556100845337, "epoch": 0.5367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.15335244089432232, "kl": 0.048208750784397125, "learning_rate": 9.816004358018603e-07, "loss": 0.0005, "num_tokens": 13695995.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000241994857788, "sampling/importance_sampling_ratio/min": 0.4078652560710907, "sampling/sampling_logp_difference/max": 0.9372653961181641, "sampling/sampling_logp_difference/mean": 0.015760913491249084, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 173.296875, "completions/mean_terminated_length": 173.296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.16149796545505524, "epoch": 0.5379901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.058144732808689745, "kl": 0.0249975323677063, "learning_rate": 9.814084719074204e-07, "loss": 0.0002, "num_tokens": 13723310.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007734298706055, "sampling/importance_sampling_ratio/min": 0.5054357051849365, "sampling/sampling_logp_difference/max": 0.9605855941772461, "sampling/sampling_logp_difference/mean": 0.014678625389933586, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 209.4375, "completions/mean_terminated_length": 209.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.20305787026882172, "epoch": 0.5392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.211216325141133, "kl": 0.06661221385002136, "learning_rate": 9.81215530784281e-07, "loss": -0.007, "num_tokens": 13752570.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6356521844863892, "sampling/importance_sampling_ratio/mean": 1.0004148483276367, "sampling/importance_sampling_ratio/min": 0.4958733022212982, "sampling/sampling_logp_difference/max": 0.701434850692749, "sampling/sampling_logp_difference/mean": 0.013643559068441391, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 154.40625, "completions/mean_terminated_length": 154.40625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.15312139689922333, "epoch": 0.5404411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0928211989498208, "kl": 0.023275483399629593, "learning_rate": 9.810216128240996e-07, "loss": 0.0002, "num_tokens": 13776740.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6610887050628662, "sampling/importance_sampling_ratio/mean": 0.9994547367095947, "sampling/importance_sampling_ratio/min": 0.45958274602890015, "sampling/sampling_logp_difference/max": 0.7774362564086914, "sampling/sampling_logp_difference/mean": 0.013531510718166828, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 221.78125, "completions/mean_terminated_length": 221.78125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.17526143789291382, "epoch": 0.5416666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.05870698110631625, "kl": 0.03668907284736633, "learning_rate": 9.808267184205181e-07, "loss": 0.0003, "num_tokens": 13812566.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999535322189331, "sampling/importance_sampling_ratio/min": 0.4309251308441162, "sampling/sampling_logp_difference/max": 0.8418209552764893, "sampling/sampling_logp_difference/mean": 0.014167534187436104, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 257.953125, "completions/mean_terminated_length": 257.953125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2431107461452484, "epoch": 0.5428921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.036190409674592666, "kl": 0.03278276324272156, "learning_rate": 9.806308479691594e-07, "loss": 0.0003, "num_tokens": 13850307.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.849806547164917, "sampling/importance_sampling_ratio/mean": 0.9998242259025574, "sampling/importance_sampling_ratio/min": 0.415968656539917, "sampling/sampling_logp_difference/max": 0.8771454095840454, "sampling/sampling_logp_difference/mean": 0.016216067597270012, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 153.953125, "completions/mean_terminated_length": 153.953125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22003903985023499, "epoch": 0.5441176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.2512830909674193, "kl": 0.06804344803094864, "learning_rate": 9.80434001867628e-07, "loss": 0.0006, "num_tokens": 13885056.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6728312969207764, "sampling/importance_sampling_ratio/mean": 0.9997739791870117, "sampling/importance_sampling_ratio/min": 0.28261417150497437, "sampling/sampling_logp_difference/max": 1.2636725902557373, "sampling/sampling_logp_difference/mean": 0.018444612622261047, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.18086552619934082, "epoch": 0.5453431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.18589688409829208, "kl": 0.03681885451078415, "learning_rate": 9.802361805155097e-07, "loss": 0.0004, "num_tokens": 13909120.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9687495231628418, "sampling/importance_sampling_ratio/mean": 1.0004655122756958, "sampling/importance_sampling_ratio/min": 0.3846133351325989, "sampling/sampling_logp_difference/max": 0.9555168151855469, "sampling/sampling_logp_difference/mean": 0.014596343040466309, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 247.53125, "completions/mean_terminated_length": 247.53125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.22909684479236603, "epoch": 0.5465686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 1.4066834384561027, "kl": 0.03148864209651947, "learning_rate": 9.800373843143683e-07, "loss": -0.0148, "num_tokens": 13952706.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.952415108680725, "sampling/importance_sampling_ratio/mean": 0.9993150234222412, "sampling/importance_sampling_ratio/min": 0.509168803691864, "sampling/sampling_logp_difference/max": 0.6749756336212158, "sampling/sampling_logp_difference/mean": 0.01676628738641739, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 192.53125, "completions/mean_terminated_length": 192.53125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.21134613454341888, "epoch": 0.5477941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.04649144067970036, "kl": 0.026450635865330696, "learning_rate": 9.798376136677484e-07, "loss": 0.0003, "num_tokens": 13981988.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6316014528274536, "sampling/importance_sampling_ratio/mean": 1.00005042552948, "sampling/importance_sampling_ratio/min": 0.36388882994651794, "sampling/sampling_logp_difference/max": 1.0109069347381592, "sampling/sampling_logp_difference/mean": 0.01675606518983841, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18353894352912903, "epoch": 0.5490196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.07195227881882807, "kl": 0.025355882942676544, "learning_rate": 9.796368689811712e-07, "loss": 0.0003, "num_tokens": 14008724.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007448196411133, "sampling/importance_sampling_ratio/min": 0.5391724109649658, "sampling/sampling_logp_difference/max": 0.854957103729248, "sampling/sampling_logp_difference/mean": 0.015763752162456512, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 160.984375, "completions/mean_terminated_length": 160.984375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.19537027180194855, "epoch": 0.5502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 2.1786144413384627, "kl": 0.043188292533159256, "learning_rate": 9.79435150662136e-07, "loss": -0.0323, "num_tokens": 14032211.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6207278966903687, "sampling/importance_sampling_ratio/mean": 0.9998423457145691, "sampling/importance_sampling_ratio/min": 0.5002011656761169, "sampling/sampling_logp_difference/max": 0.6927449703216553, "sampling/sampling_logp_difference/mean": 0.014301232062280178, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 240.484375, "completions/mean_terminated_length": 240.484375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1776406466960907, "epoch": 0.5514705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.1684315466351016, "kl": 0.01867382600903511, "learning_rate": 9.792324591201177e-07, "loss": -0.0018, "num_tokens": 14071186.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003172159194946, "sampling/importance_sampling_ratio/min": 0.4056454002857208, "sampling/sampling_logp_difference/max": 1.3454623222351074, "sampling/sampling_logp_difference/mean": 0.013690703548491001, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 202.015625, "completions/mean_terminated_length": 202.015625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.23214983940124512, "epoch": 0.5526960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.2704091917734717, "kl": 0.03874318674206734, "learning_rate": 9.790287947665681e-07, "loss": -0.0046, "num_tokens": 14106003.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998515248298645, "sampling/importance_sampling_ratio/min": 0.3239980638027191, "sampling/sampling_logp_difference/max": 1.1270177364349365, "sampling/sampling_logp_difference/mean": 0.015781264752149582, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 171.78125, "completions/mean_terminated_length": 171.78125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1903134286403656, "epoch": 0.553921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.06348400197571344, "kl": 0.02989325113594532, "learning_rate": 9.788241580149122e-07, "loss": 0.0003, "num_tokens": 14139461.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9100010395050049, "sampling/importance_sampling_ratio/mean": 0.9999560713768005, "sampling/importance_sampling_ratio/min": 0.5976356863975525, "sampling/sampling_logp_difference/max": 0.6471037864685059, "sampling/sampling_logp_difference/mean": 0.015057485550642014, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 138.28125, "completions/mean_terminated_length": 138.28125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.18952172994613647, "epoch": 0.5551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.09591476875993776, "kl": 0.04611412063241005, "learning_rate": 9.786185492805501e-07, "loss": 0.0005, "num_tokens": 14163767.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996397495269775, "sampling/importance_sampling_ratio/min": 0.4265861511230469, "sampling/sampling_logp_difference/max": 0.8519408702850342, "sampling/sampling_logp_difference/mean": 0.01649576798081398, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 150.828125, "completions/mean_terminated_length": 150.828125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.1970883011817932, "epoch": 0.5563725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.04987476389528286, "kl": 0.04483035206794739, "learning_rate": 9.784119689808542e-07, "loss": 0.0004, "num_tokens": 14193116.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001142024993896, "sampling/importance_sampling_ratio/min": 0.4021225869655609, "sampling/sampling_logp_difference/max": 0.9109983444213867, "sampling/sampling_logp_difference/mean": 0.016262732446193695, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.1636163592338562, "epoch": 0.5575980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.8765250651093985, "kl": 0.016119275242090225, "learning_rate": 9.782044175351699e-07, "loss": -0.0518, "num_tokens": 14225196.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8127318620681763, "sampling/importance_sampling_ratio/mean": 0.9994531273841858, "sampling/importance_sampling_ratio/min": 0.5260621905326843, "sampling/sampling_logp_difference/max": 0.6423358917236328, "sampling/sampling_logp_difference/mean": 0.012707584537565708, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 186.34375, "completions/mean_terminated_length": 186.34375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.22686845064163208, "epoch": 0.5588235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.4617782678969478, "kl": 0.058060139417648315, "learning_rate": 9.779958953648129e-07, "loss": 0.0103, "num_tokens": 14256354.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003458261489868, "sampling/importance_sampling_ratio/min": 0.24500620365142822, "sampling/sampling_logp_difference/max": 2.2618141174316406, "sampling/sampling_logp_difference/mean": 0.017254013568162918, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.21629858016967773, "epoch": 0.5600490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.07544619848491381, "kl": 0.047936707735061646, "learning_rate": 9.777864028930705e-07, "loss": 0.0004, "num_tokens": 14282466.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003684759140015, "sampling/importance_sampling_ratio/min": 0.5428855419158936, "sampling/sampling_logp_difference/max": 0.7252051830291748, "sampling/sampling_logp_difference/mean": 0.01494472287595272, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.11612190306186676, "epoch": 0.5612745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.03621998780538075, "kl": 0.014105316251516342, "learning_rate": 9.775759405451986e-07, "loss": 0.0001, "num_tokens": 14316578.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000373125076294, "sampling/importance_sampling_ratio/min": 0.39854487776756287, "sampling/sampling_logp_difference/max": 0.9199352264404297, "sampling/sampling_logp_difference/mean": 0.009833992458879948, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.20763754844665527, "epoch": 0.5625, "frac_reward_zero_std": 0.75, "grad_norm": 1.7741586485869132, "kl": 0.03014390356838703, "learning_rate": 9.773645087484228e-07, "loss": -0.0053, "num_tokens": 14345858.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6122909784317017, "sampling/importance_sampling_ratio/mean": 0.9992804527282715, "sampling/importance_sampling_ratio/min": 0.43093565106391907, "sampling/sampling_logp_difference/max": 0.8417965173721313, "sampling/sampling_logp_difference/mean": 0.01546657457947731, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 248.46875, "completions/mean_terminated_length": 248.46875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.19547869265079498, "epoch": 0.5637254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.04159917279096464, "kl": 0.026621131226420403, "learning_rate": 9.771521079319363e-07, "loss": 0.0002, "num_tokens": 14382752.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9117350578308105, "sampling/importance_sampling_ratio/mean": 1.0008268356323242, "sampling/importance_sampling_ratio/min": 0.5671007037162781, "sampling/sampling_logp_difference/max": 0.6480112075805664, "sampling/sampling_logp_difference/mean": 0.013971008360385895, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 210.953125, "completions/mean_terminated_length": 210.953125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.21355372667312622, "epoch": 0.5649509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.262666879156973, "kl": 0.027984999120235443, "learning_rate": 9.76938738526899e-07, "loss": -0.0045, "num_tokens": 14417517.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994992017745972, "sampling/importance_sampling_ratio/min": 0.11738044768571854, "sampling/sampling_logp_difference/max": 2.1423349380493164, "sampling/sampling_logp_difference/mean": 0.015855856239795685, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.11451134085655212, "epoch": 0.5661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.030398948615140087, "kl": 0.010230224579572678, "learning_rate": 9.767244009664376e-07, "loss": 0.0001, "num_tokens": 14455885.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7448595762252808, "sampling/importance_sampling_ratio/mean": 0.9992393255233765, "sampling/importance_sampling_ratio/min": 0.41925525665283203, "sampling/sampling_logp_difference/max": 0.8692753314971924, "sampling/sampling_logp_difference/mean": 0.010393896140158176, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 214.96875, "completions/mean_terminated_length": 214.96875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.19482392072677612, "epoch": 0.5674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.049231483585913345, "kl": 0.027302060276269913, "learning_rate": 9.765090956856435e-07, "loss": 0.0002, "num_tokens": 14488043.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5804944038391113, "sampling/importance_sampling_ratio/mean": 0.9997097253799438, "sampling/importance_sampling_ratio/min": 0.18332743644714355, "sampling/sampling_logp_difference/max": 1.696481466293335, "sampling/sampling_logp_difference/mean": 0.013040466234087944, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 218.9375, "completions/mean_terminated_length": 218.9375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.20897147059440613, "epoch": 0.5686274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.5998888353151224, "kl": 0.03444429486989975, "learning_rate": 9.76292823121573e-07, "loss": -0.0919, "num_tokens": 14522503.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000460147857666, "sampling/importance_sampling_ratio/min": 0.4952988624572754, "sampling/sampling_logp_difference/max": 0.7552809715270996, "sampling/sampling_logp_difference/mean": 0.01724259927868843, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 242.953125, "completions/mean_terminated_length": 242.953125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20893335342407227, "epoch": 0.5698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.4165889380237286, "kl": 0.02970290556550026, "learning_rate": 9.760755837132457e-07, "loss": 0.1361, "num_tokens": 14563652.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001609325408936, "sampling/importance_sampling_ratio/min": 0.4954153895378113, "sampling/sampling_logp_difference/max": 1.0007104873657227, "sampling/sampling_logp_difference/mean": 0.014235898852348328, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 144.1875, "completions/mean_terminated_length": 144.1875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.1707545965909958, "epoch": 0.571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.08934035938595794, "kl": 0.026488985866308212, "learning_rate": 9.758573779016436e-07, "loss": 0.0003, "num_tokens": 14584912.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007481575012207, "sampling/importance_sampling_ratio/min": 0.2991707921028137, "sampling/sampling_logp_difference/max": 1.2067406177520752, "sampling/sampling_logp_difference/mean": 0.014445105567574501, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 257.671875, "completions/mean_terminated_length": 257.671875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.17051297426223755, "epoch": 0.5723039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.1525910990314554, "kl": 0.021965259686112404, "learning_rate": 9.75638206129711e-07, "loss": 0.0003, "num_tokens": 14616699.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.697697401046753, "sampling/importance_sampling_ratio/mean": 0.9999450445175171, "sampling/importance_sampling_ratio/min": 0.4663052260875702, "sampling/sampling_logp_difference/max": 0.7629148960113525, "sampling/sampling_logp_difference/mean": 0.013579754158854485, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2851870059967041, "epoch": 0.5735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 2.1066582547543717, "kl": 0.053356803953647614, "learning_rate": 9.754180688423524e-07, "loss": -0.0102, "num_tokens": 14649371.0, "reward": 0.28125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001518726348877, "sampling/importance_sampling_ratio/min": 0.46914398670196533, "sampling/sampling_logp_difference/max": 0.7568455934524536, "sampling/sampling_logp_difference/mean": 0.018635809421539307, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 216.890625, "completions/mean_terminated_length": 216.890625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.24624094367027283, "epoch": 0.5747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.3750226279227016, "kl": 0.030825216323137283, "learning_rate": 9.751969664864326e-07, "loss": -0.0707, "num_tokens": 14682580.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6819995641708374, "sampling/importance_sampling_ratio/mean": 0.9998189210891724, "sampling/importance_sampling_ratio/min": 0.2631002366542816, "sampling/sampling_logp_difference/max": 1.335220217704773, "sampling/sampling_logp_difference/mean": 0.017620351165533066, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 245.515625, "completions/mean_terminated_length": 245.515625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.1737528145313263, "epoch": 0.5759803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.03394921308531916, "kl": 0.017159651964902878, "learning_rate": 9.749748995107756e-07, "loss": 0.0002, "num_tokens": 14715765.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999651312828064, "sampling/importance_sampling_ratio/min": 0.4947980046272278, "sampling/sampling_logp_difference/max": 0.751502513885498, "sampling/sampling_logp_difference/mean": 0.014153973199427128, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 195.171875, "completions/mean_terminated_length": 195.171875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22031019628047943, "epoch": 0.5772058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 2.060635684604694, "kl": 0.03163802623748779, "learning_rate": 9.74751868366163e-07, "loss": 0.0385, "num_tokens": 14746240.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5807092189788818, "sampling/importance_sampling_ratio/mean": 0.9998636841773987, "sampling/importance_sampling_ratio/min": 0.5622367858886719, "sampling/sampling_logp_difference/max": 0.575832188129425, "sampling/sampling_logp_difference/mean": 0.01457374356687069, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 262.109375, "completions/mean_terminated_length": 262.109375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.26105907559394836, "epoch": 0.5784313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.1793484778611238, "kl": 0.025164538994431496, "learning_rate": 9.745278735053343e-07, "loss": 0.0405, "num_tokens": 14791287.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.969687581062317, "sampling/importance_sampling_ratio/mean": 1.0003187656402588, "sampling/importance_sampling_ratio/min": 0.19187341630458832, "sampling/sampling_logp_difference/max": 1.6509194374084473, "sampling/sampling_logp_difference/mean": 0.01665969006717205, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 285.921875, "completions/mean_terminated_length": 285.921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2519185543060303, "epoch": 0.5796568627450981, "frac_reward_zero_std": 0.0, "grad_norm": 1.9432117433202565, "kl": 0.04478658735752106, "learning_rate": 9.743029153829845e-07, "loss": -0.035, "num_tokens": 14829010.0, "reward": 0.5625, "reward_std": 0.6707825064659119, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9441853761672974, "sampling/importance_sampling_ratio/mean": 1.0006245374679565, "sampling/importance_sampling_ratio/min": 0.536237895488739, "sampling/sampling_logp_difference/max": 0.6648430824279785, "sampling/sampling_logp_difference/mean": 0.01648460328578949, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.22762653231620789, "epoch": 0.5808823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 1.4965402464777053, "kl": 0.030943382531404495, "learning_rate": 9.740769944557644e-07, "loss": -0.0013, "num_tokens": 14863714.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8166251182556152, "sampling/importance_sampling_ratio/mean": 1.0000070333480835, "sampling/importance_sampling_ratio/min": 0.4871021509170532, "sampling/sampling_logp_difference/max": 0.7192814350128174, "sampling/sampling_logp_difference/mean": 0.016907792538404465, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 156.59375, "completions/mean_terminated_length": 156.59375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.24233081936836243, "epoch": 0.5821078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.636056679673925, "kl": 0.04024767875671387, "learning_rate": 9.738501111822792e-07, "loss": -0.0159, "num_tokens": 14896408.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000391960144043, "sampling/importance_sampling_ratio/min": 0.3063385486602783, "sampling/sampling_logp_difference/max": 1.311995267868042, "sampling/sampling_logp_difference/mean": 0.018402356654405594, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 147.84375, "completions/mean_terminated_length": 147.84375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.13406597077846527, "epoch": 0.5833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.10571691731999149, "kl": 0.020086705684661865, "learning_rate": 9.736222660230878e-07, "loss": 0.0002, "num_tokens": 14927758.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8623803853988647, "sampling/importance_sampling_ratio/mean": 1.0003776550292969, "sampling/importance_sampling_ratio/min": 0.3642270565032959, "sampling/sampling_logp_difference/max": 1.0099778175354004, "sampling/sampling_logp_difference/mean": 0.013062086887657642, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 212.078125, "completions/mean_terminated_length": 212.078125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2687431275844574, "epoch": 0.5845588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.04975967264875057, "kl": 0.02250150591135025, "learning_rate": 9.73393459440701e-07, "loss": 0.0002, "num_tokens": 14961219.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7017096281051636, "sampling/importance_sampling_ratio/mean": 0.9993106126785278, "sampling/importance_sampling_ratio/min": 0.410661906003952, "sampling/sampling_logp_difference/max": 0.8899850845336914, "sampling/sampling_logp_difference/mean": 0.01792370155453682, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 360.171875, "completions/mean_terminated_length": 286.5238342285156, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2159585952758789, "epoch": 0.5857843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.0133469177612149, "kl": 0.016694650053977966, "learning_rate": 9.73163691899582e-07, "loss": 0.0339, "num_tokens": 15004382.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003795623779297, "sampling/importance_sampling_ratio/min": 0.4347582757472992, "sampling/sampling_logp_difference/max": 0.8329651355743408, "sampling/sampling_logp_difference/mean": 0.013661239296197891, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 128.109375, "completions/mean_terminated_length": 128.109375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1839389204978943, "epoch": 0.5870098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.07097632624473711, "kl": 0.03193610906600952, "learning_rate": 9.729329638661444e-07, "loss": 0.0003, "num_tokens": 15030645.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9905723333358765, "sampling/importance_sampling_ratio/mean": 0.9994369149208069, "sampling/importance_sampling_ratio/min": 0.48416680097579956, "sampling/sampling_logp_difference/max": 0.7253258228302002, "sampling/sampling_logp_difference/mean": 0.01541130430996418, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 291.890625, "completions/mean_terminated_length": 291.890625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1650594174861908, "epoch": 0.5882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.04103054333466548, "kl": 0.017028018832206726, "learning_rate": 9.727012758087512e-07, "loss": 0.0002, "num_tokens": 15068974.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998735189437866, "sampling/importance_sampling_ratio/min": 0.47690320014953613, "sampling/sampling_logp_difference/max": 0.7404417991638184, "sampling/sampling_logp_difference/mean": 0.012647148221731186, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 228.65625, "completions/mean_terminated_length": 228.65625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2747301459312439, "epoch": 0.5894607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 1.8292346976018947, "kl": 0.03598939999938011, "learning_rate": 9.724686281977146e-07, "loss": 0.0381, "num_tokens": 15103848.0, "reward": -0.03125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8844454288482666, "sampling/importance_sampling_ratio/mean": 1.000560998916626, "sampling/importance_sampling_ratio/min": 0.272928386926651, "sampling/sampling_logp_difference/max": 1.2985458374023438, "sampling/sampling_logp_difference/mean": 0.01755647361278534, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 174.578125, "completions/mean_terminated_length": 174.578125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.2639964520931244, "epoch": 0.5906862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.6421195753360995, "kl": 0.039081934839487076, "learning_rate": 9.722350215052946e-07, "loss": -0.0048, "num_tokens": 15137453.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.9253288507461548, "sampling/importance_sampling_ratio/mean": 1.0001544952392578, "sampling/importance_sampling_ratio/min": 0.4358985722064972, "sampling/sampling_logp_difference/max": 0.8303457498550415, "sampling/sampling_logp_difference/mean": 0.019417976960539818, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 198.59375, "completions/mean_terminated_length": 198.59375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.16699634492397308, "epoch": 0.5919117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.04647039796785461, "kl": 0.022983193397521973, "learning_rate": 9.720004562056979e-07, "loss": 0.0002, "num_tokens": 15168051.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.793419361114502, "sampling/importance_sampling_ratio/mean": 1.0000264644622803, "sampling/importance_sampling_ratio/min": 0.5038398504257202, "sampling/sampling_logp_difference/max": 0.6854968070983887, "sampling/sampling_logp_difference/mean": 0.012654967606067657, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 182.703125, "completions/mean_terminated_length": 182.703125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.24964210391044617, "epoch": 0.5931372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.4106894254929927, "kl": 0.048736389726400375, "learning_rate": 9.717649327750773e-07, "loss": -0.0088, "num_tokens": 15198944.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993613958358765, "sampling/importance_sampling_ratio/min": 0.4776141345500946, "sampling/sampling_logp_difference/max": 1.0320630073547363, "sampling/sampling_logp_difference/mean": 0.018597744405269623, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 204.65625, "completions/mean_terminated_length": 204.65625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.18694047629833221, "epoch": 0.5943627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.05743230416692876, "kl": 0.027357913553714752, "learning_rate": 9.7152845169153e-07, "loss": 0.0003, "num_tokens": 15229242.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9612669944763184, "sampling/importance_sampling_ratio/mean": 1.000275731086731, "sampling/importance_sampling_ratio/min": 0.44152477383613586, "sampling/sampling_logp_difference/max": 0.8175210952758789, "sampling/sampling_logp_difference/mean": 0.01288670301437378, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 279.234375, "completions/mean_terminated_length": 279.234375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.26772433519363403, "epoch": 0.5955882352941176, "frac_reward_zero_std": 0.25, "grad_norm": 2.518300448825345, "kl": 0.021690839901566505, "learning_rate": 9.712910134350984e-07, "loss": -0.0305, "num_tokens": 15264617.0, "reward": 0.1875, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998605251312256, "sampling/importance_sampling_ratio/min": 0.4105510711669922, "sampling/sampling_logp_difference/max": 1.3690159320831299, "sampling/sampling_logp_difference/mean": 0.017053838819265366, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 193.078125, "completions/mean_terminated_length": 193.078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.20557519793510437, "epoch": 0.5968137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.4254188091002689, "kl": 0.029017888009548187, "learning_rate": 9.710526184877666e-07, "loss": -0.0031, "num_tokens": 15291246.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8111883401870728, "sampling/importance_sampling_ratio/mean": 1.000805377960205, "sampling/importance_sampling_ratio/min": 0.5910916924476624, "sampling/sampling_logp_difference/max": 0.5939831733703613, "sampling/sampling_logp_difference/mean": 0.013951467350125313, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 150.71875, "completions/mean_terminated_length": 150.71875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2271089106798172, "epoch": 0.5980392156862745, "frac_reward_zero_std": 0.5, "grad_norm": 2.188118279180718, "kl": 0.03827241063117981, "learning_rate": 9.708132673334615e-07, "loss": 0.0009, "num_tokens": 15317116.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997679591178894, "sampling/importance_sampling_ratio/min": 0.22370855510234833, "sampling/sampling_logp_difference/max": 1.4974112510681152, "sampling/sampling_logp_difference/mean": 0.016535578295588493, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.333241730928421, "epoch": 0.5992647058823529, "frac_reward_zero_std": 0.25, "grad_norm": 2.288477586033347, "kl": 0.030107948929071426, "learning_rate": 9.705729604580505e-07, "loss": 0.0076, "num_tokens": 15351376.0, "reward": 0.53125, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6156491041183472, "sampling/importance_sampling_ratio/mean": 1.0002347230911255, "sampling/importance_sampling_ratio/min": 0.4024190306663513, "sampling/sampling_logp_difference/max": 0.9102613925933838, "sampling/sampling_logp_difference/mean": 0.018553704023361206, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 173.578125, "completions/mean_terminated_length": 173.578125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.20034442842006683, "epoch": 0.6004901960784313, "frac_reward_zero_std": 0.5, "grad_norm": 2.652665815530287, "kl": 0.04240548610687256, "learning_rate": 9.703316983493412e-07, "loss": 0.0079, "num_tokens": 15379189.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004112720489502, "sampling/importance_sampling_ratio/min": 0.5910095572471619, "sampling/sampling_logp_difference/max": 0.7670514583587646, "sampling/sampling_logp_difference/mean": 0.014264474622905254, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 311.359375, "completions/mean_terminated_length": 311.359375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.24044373631477356, "epoch": 0.6017156862745098, "frac_reward_zero_std": 0.0, "grad_norm": 2.1011466606746954, "kl": 0.04351915419101715, "learning_rate": 9.700894814970808e-07, "loss": 0.0258, "num_tokens": 15413612.0, "reward": 0.25, "reward_std": 0.8767043352127075, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.8453627824783325, "sampling/importance_sampling_ratio/mean": 0.9997578859329224, "sampling/importance_sampling_ratio/min": 0.6080636978149414, "sampling/sampling_logp_difference/max": 0.6126759052276611, "sampling/sampling_logp_difference/mean": 0.014367097988724709, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 191.953125, "completions/mean_terminated_length": 191.953125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.28681284189224243, "epoch": 0.6029411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.8833523639178975, "kl": 0.04870586842298508, "learning_rate": 9.698463103929541e-07, "loss": 0.0007, "num_tokens": 15442297.0, "reward": 0.34375, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5971494913101196, "sampling/importance_sampling_ratio/mean": 0.9996565580368042, "sampling/importance_sampling_ratio/min": 0.4826450049877167, "sampling/sampling_logp_difference/max": 0.7284739017486572, "sampling/sampling_logp_difference/mean": 0.01625833474099636, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 187.0625, "completions/mean_terminated_length": 187.0625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.20023608207702637, "epoch": 0.6041666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 2.173857085033842, "kl": 0.03565539792180061, "learning_rate": 9.69602185530583e-07, "loss": -0.0005, "num_tokens": 15472461.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011024475097656, "sampling/importance_sampling_ratio/min": 0.18803824484348297, "sampling/sampling_logp_difference/max": 1.671109914779663, "sampling/sampling_logp_difference/mean": 0.01602831482887268, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 199.921875, "completions/mean_terminated_length": 199.921875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2912241816520691, "epoch": 0.6053921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 2.1771468516592747, "kl": 0.04060687497258186, "learning_rate": 9.693571074055254e-07, "loss": 0.0522, "num_tokens": 15501400.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995718002319336, "sampling/importance_sampling_ratio/min": 0.5355345606803894, "sampling/sampling_logp_difference/max": 1.2372488975524902, "sampling/sampling_logp_difference/mean": 0.01795007474720478, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 241.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2918547987937927, "epoch": 0.6066176470588235, "frac_reward_zero_std": 0.25, "grad_norm": 2.1390454655609363, "kl": 0.04890880733728409, "learning_rate": 9.691110765152744e-07, "loss": -0.0128, "num_tokens": 15534832.0, "reward": -0.53125, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": -0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995063543319702, "sampling/importance_sampling_ratio/min": 0.39389124512672424, "sampling/sampling_logp_difference/max": 0.93168044090271, "sampling/sampling_logp_difference/mean": 0.0170971117913723, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 202.90625, "completions/mean_terminated_length": 202.90625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22431150078773499, "epoch": 0.6078431372549019, "frac_reward_zero_std": 0.75, "grad_norm": 1.1644071437761385, "kl": 0.04543592780828476, "learning_rate": 9.688640933592572e-07, "loss": 0.0014, "num_tokens": 15563114.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.8400806188583374, "sampling/importance_sampling_ratio/mean": 0.9998714327812195, "sampling/importance_sampling_ratio/min": 0.5355109572410583, "sampling/sampling_logp_difference/max": 0.6245338916778564, "sampling/sampling_logp_difference/mean": 0.013310464099049568, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 247.390625, "completions/mean_terminated_length": 247.390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.32082051038742065, "epoch": 0.6090686274509803, "frac_reward_zero_std": 0.0, "grad_norm": 2.7337860353603984, "kl": 0.038668520748615265, "learning_rate": 9.686161584388339e-07, "loss": -0.0442, "num_tokens": 15595955.0, "reward": 0.1875, "reward_std": 0.7473389506340027, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.8616005182266235, "sampling/importance_sampling_ratio/mean": 1.001033067703247, "sampling/importance_sampling_ratio/min": 0.4488525390625, "sampling/sampling_logp_difference/max": 0.8010609149932861, "sampling/sampling_logp_difference/mean": 0.020086873322725296, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 198.28125, "completions/mean_terminated_length": 198.28125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.36790406703948975, "epoch": 0.6102941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 2.712605465705619, "kl": 0.052883580327034, "learning_rate": 9.683672722572966e-07, "loss": -0.0124, "num_tokens": 15625461.0, "reward": 0.25, "reward_std": 0.8881268501281738, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995077848434448, "sampling/importance_sampling_ratio/min": 0.2736213505268097, "sampling/sampling_logp_difference/max": 1.2960100173950195, "sampling/sampling_logp_difference/mean": 0.021305786445736885, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 296.984375, "completions/mean_terminated_length": 296.984375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2703689634799957, "epoch": 0.6115196078431373, "frac_reward_zero_std": 0.25, "grad_norm": 1.637297883464752, "kl": 0.024464547634124756, "learning_rate": 9.681174353198686e-07, "loss": -0.0025, "num_tokens": 15662500.0, "reward": 0.75, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.9148050546646118, "sampling/importance_sampling_ratio/mean": 0.9993337392807007, "sampling/importance_sampling_ratio/min": 0.4634568393230438, "sampling/sampling_logp_difference/max": 0.7690420150756836, "sampling/sampling_logp_difference/mean": 0.015952816233038902, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.25232523679733276, "epoch": 0.6127450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 2.2686866057794326, "kl": 0.03192438930273056, "learning_rate": 9.678666481337031e-07, "loss": -0.079, "num_tokens": 15692992.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6614391803741455, "sampling/importance_sampling_ratio/mean": 1.0009832382202148, "sampling/importance_sampling_ratio/min": 0.4818120300769806, "sampling/sampling_logp_difference/max": 0.730201244354248, "sampling/sampling_logp_difference/mean": 0.016551656648516655, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 245.5625, "completions/mean_terminated_length": 245.5625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.25726181268692017, "epoch": 0.6139705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.560473235310796, "kl": 0.053573839366436005, "learning_rate": 9.67614911207882e-07, "loss": -0.0075, "num_tokens": 15725348.0, "reward": -0.1875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992358088493347, "sampling/importance_sampling_ratio/min": 0.4940890669822693, "sampling/sampling_logp_difference/max": 0.7050395011901855, "sampling/sampling_logp_difference/mean": 0.014738351106643677, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 285.0625, "completions/mean_terminated_length": 285.0625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.21006496250629425, "epoch": 0.6151960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.7539990683844793, "kl": 0.02914992719888687, "learning_rate": 9.673622250534155e-07, "loss": -0.0089, "num_tokens": 15763160.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5782129764556885, "sampling/importance_sampling_ratio/mean": 0.9996501207351685, "sampling/importance_sampling_ratio/min": 0.4119178354740143, "sampling/sampling_logp_difference/max": 0.8869314193725586, "sampling/sampling_logp_difference/mean": 0.01188575103878975, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 144.296875, "completions/mean_terminated_length": 144.296875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.16736435890197754, "epoch": 0.616421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.6543420321245603, "kl": 0.04313846677541733, "learning_rate": 9.671085901832404e-07, "loss": 0.048, "num_tokens": 15785435.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9100173711776733, "sampling/importance_sampling_ratio/mean": 0.9994516968727112, "sampling/importance_sampling_ratio/min": 0.04118496552109718, "sampling/sampling_logp_difference/max": 3.1896820068359375, "sampling/sampling_logp_difference/mean": 0.013791139237582684, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 191.046875, "completions/mean_terminated_length": 191.046875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.22429190576076508, "epoch": 0.6176470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 2.4627974240636274, "kl": 0.045783478766679764, "learning_rate": 9.668540071122195e-07, "loss": -0.0016, "num_tokens": 15813630.0, "reward": 0.21875, "reward_std": 0.6505630612373352, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.7294622659683228, "sampling/importance_sampling_ratio/mean": 0.999798595905304, "sampling/importance_sampling_ratio/min": 0.37196803092956543, "sampling/sampling_logp_difference/max": 0.9889473915100098, "sampling/sampling_logp_difference/mean": 0.015147536993026733, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 217.34375, "completions/mean_terminated_length": 217.34375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.27516812086105347, "epoch": 0.6188725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.5642094762042984, "kl": 0.06203371286392212, "learning_rate": 9.665984763571402e-07, "loss": -0.0135, "num_tokens": 15845908.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5175708532333374, "sampling/importance_sampling_ratio/mean": 0.9990229606628418, "sampling/importance_sampling_ratio/min": 0.41066715121269226, "sampling/sampling_logp_difference/max": 0.8899722099304199, "sampling/sampling_logp_difference/mean": 0.01645125448703766, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23022127151489258, "epoch": 0.6200980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 2.063309763162759, "kl": 0.04602504521608353, "learning_rate": 9.663419984367137e-07, "loss": -0.0305, "num_tokens": 15877268.0, "reward": 0.0625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.817983865737915, "sampling/importance_sampling_ratio/mean": 1.0002416372299194, "sampling/importance_sampling_ratio/min": 0.6171835660934448, "sampling/sampling_logp_difference/max": 0.5977281332015991, "sampling/sampling_logp_difference/mean": 0.015211190097033978, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2860516309738159, "epoch": 0.6213235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.3938382172467045, "kl": 0.05346071720123291, "learning_rate": 9.660845738715742e-07, "loss": 0.0177, "num_tokens": 15907652.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 0.9996599555015564, "sampling/importance_sampling_ratio/min": 0.5685405731201172, "sampling/sampling_logp_difference/max": 0.5646826028823853, "sampling/sampling_logp_difference/mean": 0.01554498914629221, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.33097076416015625, "epoch": 0.6225490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 2.13984277246622, "kl": 0.0651451051235199, "learning_rate": 9.658262031842769e-07, "loss": -0.0326, "num_tokens": 15937208.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.7676414251327515, "sampling/importance_sampling_ratio/mean": 1.0007094144821167, "sampling/importance_sampling_ratio/min": 0.5665751695632935, "sampling/sampling_logp_difference/max": 0.5696461200714111, "sampling/sampling_logp_difference/mean": 0.018425047397613525, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 187.65625, "completions/mean_terminated_length": 187.65625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.24647732079029083, "epoch": 0.6237745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.771429223174817, "kl": 0.056372880935668945, "learning_rate": 9.655668868992983e-07, "loss": -0.0004, "num_tokens": 15968802.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991722106933594, "sampling/importance_sampling_ratio/min": 0.09368312358856201, "sampling/sampling_logp_difference/max": 2.3678371906280518, "sampling/sampling_logp_difference/mean": 0.016202397644519806, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 152.09375, "completions/mean_terminated_length": 152.09375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21541383862495422, "epoch": 0.625, "frac_reward_zero_std": 0.25, "grad_norm": 2.9867445811566165, "kl": 0.05038762837648392, "learning_rate": 9.653066255430338e-07, "loss": 0.0541, "num_tokens": 15994024.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999164342880249, "sampling/importance_sampling_ratio/min": 0.34088990092277527, "sampling/sampling_logp_difference/max": 1.0761957168579102, "sampling/sampling_logp_difference/mean": 0.01494535617530346, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 235.359375, "completions/mean_terminated_length": 235.359375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3029787242412567, "epoch": 0.6262254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.4808825125445177, "kl": 0.041609782725572586, "learning_rate": 9.650454196437973e-07, "loss": -0.0542, "num_tokens": 16024591.0, "reward": -0.03125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6658228635787964, "sampling/importance_sampling_ratio/mean": 0.9998339414596558, "sampling/importance_sampling_ratio/min": 0.5049895644187927, "sampling/sampling_logp_difference/max": 0.6832175254821777, "sampling/sampling_logp_difference/mean": 0.015195919200778008, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 198.53125, "completions/mean_terminated_length": 198.53125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.24742618203163147, "epoch": 0.6274509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.4148743369693726, "kl": 0.04469574987888336, "learning_rate": 9.647832697318206e-07, "loss": -0.0317, "num_tokens": 16057089.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003695487976074, "sampling/importance_sampling_ratio/min": 0.23465335369110107, "sampling/sampling_logp_difference/max": 1.44964599609375, "sampling/sampling_logp_difference/mean": 0.015340734273195267, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 254.765625, "completions/mean_terminated_length": 254.765625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2867315411567688, "epoch": 0.6286764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.5370886558265742, "kl": 0.05973536893725395, "learning_rate": 9.645201763392513e-07, "loss": 0.0284, "num_tokens": 16091170.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996262788772583, "sampling/importance_sampling_ratio/min": 0.4404844045639038, "sampling/sampling_logp_difference/max": 0.8198802471160889, "sampling/sampling_logp_difference/mean": 0.015395834110677242, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 154.671875, "completions/mean_terminated_length": 154.671875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.16976070404052734, "epoch": 0.6299019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 2.388522938175457, "kl": 0.05982288718223572, "learning_rate": 9.64256140000152e-07, "loss": -0.0058, "num_tokens": 16119709.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006251335144043, "sampling/importance_sampling_ratio/min": 0.514415442943573, "sampling/sampling_logp_difference/max": 0.7173492908477783, "sampling/sampling_logp_difference/mean": 0.013128705322742462, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 198.609375, "completions/mean_terminated_length": 198.609375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.21194711327552795, "epoch": 0.6311274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.1475040650361064, "kl": 0.05573735758662224, "learning_rate": 9.639911612505003e-07, "loss": 0.0451, "num_tokens": 16153252.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5551807880401611, "sampling/importance_sampling_ratio/mean": 1.0002050399780273, "sampling/importance_sampling_ratio/min": 0.3899836838245392, "sampling/sampling_logp_difference/max": 0.941650390625, "sampling/sampling_logp_difference/mean": 0.014565443620085716, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 181.421875, "completions/mean_terminated_length": 181.421875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.20764178037643433, "epoch": 0.6323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 2.1720468058929345, "kl": 0.05153400078415871, "learning_rate": 9.63725240628186e-07, "loss": -0.0403, "num_tokens": 16182751.0, "reward": -0.09375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004583597183228, "sampling/importance_sampling_ratio/min": 0.6154090762138367, "sampling/sampling_logp_difference/max": 0.8144304752349854, "sampling/sampling_logp_difference/mean": 0.012461901642382145, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 176.109375, "completions/mean_terminated_length": 176.109375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.27043965458869934, "epoch": 0.633578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.9558631710433498, "kl": 0.07393071800470352, "learning_rate": 9.634583786730108e-07, "loss": 0.0777, "num_tokens": 16212310.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6234568357467651, "sampling/importance_sampling_ratio/mean": 1.0003788471221924, "sampling/importance_sampling_ratio/min": 0.4622943103313446, "sampling/sampling_logp_difference/max": 0.7715535163879395, "sampling/sampling_logp_difference/mean": 0.01648622192442417, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 245.734375, "completions/mean_terminated_length": 245.734375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.18301594257354736, "epoch": 0.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.03779742217316824, "kl": 0.0384780615568161, "learning_rate": 9.63190575926688e-07, "loss": 0.0003, "num_tokens": 16246037.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5963561534881592, "sampling/importance_sampling_ratio/mean": 0.9991670250892639, "sampling/importance_sampling_ratio/min": 0.5856403708457947, "sampling/sampling_logp_difference/max": 0.5350494384765625, "sampling/sampling_logp_difference/mean": 0.012074870988726616, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 210.0625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3609851002693176, "epoch": 0.6360294117647058, "frac_reward_zero_std": 0.25, "grad_norm": 1.6217027238359476, "kl": 0.06879754364490509, "learning_rate": 9.6292183293284e-07, "loss": -0.0125, "num_tokens": 16279049.0, "reward": 0.0, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5075724124908447, "sampling/importance_sampling_ratio/mean": 1.000047206878662, "sampling/importance_sampling_ratio/min": 0.5625096559524536, "sampling/sampling_logp_difference/max": 0.5753469467163086, "sampling/sampling_logp_difference/mean": 0.017157625406980515, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 298.671875, "completions/mean_terminated_length": 298.671875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2705211043357849, "epoch": 0.6372549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 1.709078537110251, "kl": 0.044981539249420166, "learning_rate": 9.626521502369983e-07, "loss": 0.0487, "num_tokens": 16311732.0, "reward": 0.46875, "reward_std": 0.5722135901451111, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.818015694618225, "sampling/importance_sampling_ratio/mean": 0.999798595905304, "sampling/importance_sampling_ratio/min": 0.12197814881801605, "sampling/sampling_logp_difference/max": 2.1039133071899414, "sampling/sampling_logp_difference/mean": 0.013444855809211731, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 224.296875, "completions/mean_terminated_length": 224.296875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.23086273670196533, "epoch": 0.6384803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.002731924335962, "kl": 0.05612014979124069, "learning_rate": 9.623815283866015e-07, "loss": -0.018, "num_tokens": 16342279.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7553222179412842, "sampling/importance_sampling_ratio/mean": 0.9998987317085266, "sampling/importance_sampling_ratio/min": 0.510368287563324, "sampling/sampling_logp_difference/max": 0.6726226806640625, "sampling/sampling_logp_difference/mean": 0.013258611783385277, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 148.21875, "completions/mean_terminated_length": 148.21875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.24009597301483154, "epoch": 0.6397058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.865036848073618, "kl": 0.08404193073511124, "learning_rate": 9.621099679309946e-07, "loss": 0.0149, "num_tokens": 16368309.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000610589981079, "sampling/importance_sampling_ratio/min": 0.591404139995575, "sampling/sampling_logp_difference/max": 0.7522697448730469, "sampling/sampling_logp_difference/mean": 0.015585711225867271, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 177.28125, "completions/mean_terminated_length": 177.28125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.22186435759067535, "epoch": 0.6409313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 1.80332290150908, "kl": 0.08081275224685669, "learning_rate": 9.618374694214285e-07, "loss": 0.0786, "num_tokens": 16393751.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5947540998458862, "sampling/importance_sampling_ratio/mean": 1.0002617835998535, "sampling/importance_sampling_ratio/min": 0.6109259128570557, "sampling/sampling_logp_difference/max": 0.49277955293655396, "sampling/sampling_logp_difference/mean": 0.013700183480978012, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 191.1875, "completions/mean_terminated_length": 191.1875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35577911138534546, "epoch": 0.6421568627450981, "frac_reward_zero_std": 0.0, "grad_norm": 2.4853327131233844, "kl": 0.10113876312971115, "learning_rate": 9.615640334110578e-07, "loss": -0.0734, "num_tokens": 16428339.0, "reward": 0.09375, "reward_std": 0.7931214570999146, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000748872756958, "sampling/importance_sampling_ratio/min": 0.5403655171394348, "sampling/sampling_logp_difference/max": 0.842801570892334, "sampling/sampling_logp_difference/mean": 0.018896231427788734, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 147.40625, "completions/mean_terminated_length": 147.40625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2719324231147766, "epoch": 0.6433823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 1.6063992458995404, "kl": 0.0841350331902504, "learning_rate": 9.612896604549401e-07, "loss": -0.0053, "num_tokens": 16452637.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.826122760772705, "sampling/importance_sampling_ratio/mean": 1.000278115272522, "sampling/importance_sampling_ratio/min": 0.6413664221763611, "sampling/sampling_logp_difference/max": 0.6021950244903564, "sampling/sampling_logp_difference/mean": 0.01662953570485115, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 169.484375, "completions/mean_terminated_length": 169.484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2781379222869873, "epoch": 0.6446078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.1459635014793854, "kl": 0.0846191793680191, "learning_rate": 9.610143511100354e-07, "loss": 0.0113, "num_tokens": 16478364.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.65408456325531, "sampling/importance_sampling_ratio/mean": 1.0004284381866455, "sampling/importance_sampling_ratio/min": 0.5941311717033386, "sampling/sampling_logp_difference/max": 0.5206551551818848, "sampling/sampling_logp_difference/mean": 0.015035307966172695, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.27831029891967773, "epoch": 0.6458333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 1.161012926026988, "kl": 0.10464677959680557, "learning_rate": 9.607381059352038e-07, "loss": 0.0051, "num_tokens": 16510692.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5662132501602173, "sampling/importance_sampling_ratio/mean": 0.9999711513519287, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 0.6855719089508057, "sampling/sampling_logp_difference/mean": 0.016203096136450768, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 198.09375, "completions/mean_terminated_length": 198.09375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2400476336479187, "epoch": 0.6470588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 1.2877264244056412, "kl": 0.07220980525016785, "learning_rate": 9.60460925491206e-07, "loss": -0.0051, "num_tokens": 16543786.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6094046831130981, "sampling/importance_sampling_ratio/mean": 0.9998757839202881, "sampling/importance_sampling_ratio/min": 0.5140842199325562, "sampling/sampling_logp_difference/max": 0.6653681993484497, "sampling/sampling_logp_difference/mean": 0.014955186285078526, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 167.28125, "completions/mean_terminated_length": 167.28125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2575138807296753, "epoch": 0.6482843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.3237644518946134, "kl": 0.07371678948402405, "learning_rate": 9.601828103407004e-07, "loss": -0.008, "num_tokens": 16575372.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997285008430481, "sampling/importance_sampling_ratio/min": 0.5245344042778015, "sampling/sampling_logp_difference/max": 0.8339135646820068, "sampling/sampling_logp_difference/mean": 0.01670641452074051, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 183.578125, "completions/mean_terminated_length": 183.578125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.23362132906913757, "epoch": 0.6495098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.06795336830070159, "kl": 0.07781791687011719, "learning_rate": 9.599037610482433e-07, "loss": 0.0007, "num_tokens": 16604929.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000827312469482, "sampling/importance_sampling_ratio/min": 0.4381314516067505, "sampling/sampling_logp_difference/max": 0.8252363204956055, "sampling/sampling_logp_difference/mean": 0.014331628568470478, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 193.4375, "completions/mean_terminated_length": 193.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2269354909658432, "epoch": 0.6507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.05150817261310415, "kl": 0.05640610307455063, "learning_rate": 9.59623778180287e-07, "loss": 0.0006, "num_tokens": 16635693.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002391338348389, "sampling/importance_sampling_ratio/min": 0.4477214515209198, "sampling/sampling_logp_difference/max": 0.8460354804992676, "sampling/sampling_logp_difference/mean": 0.014656702056527138, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 212.6875, "completions/mean_terminated_length": 212.6875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.29390576481819153, "epoch": 0.6519607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.9920082670028044, "kl": 0.1033441424369812, "learning_rate": 9.593428623051791e-07, "loss": 0.0022, "num_tokens": 16667257.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5775432586669922, "sampling/importance_sampling_ratio/mean": 1.0003143548965454, "sampling/importance_sampling_ratio/min": 0.29912951588630676, "sampling/sampling_logp_difference/max": 1.206878662109375, "sampling/sampling_logp_difference/mean": 0.016023050993680954, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2245170921087265, "epoch": 0.6531862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.06037153225127418, "kl": 0.06282234191894531, "learning_rate": 9.59061013993161e-07, "loss": 0.0006, "num_tokens": 16700473.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5690274238586426, "sampling/importance_sampling_ratio/mean": 0.999611496925354, "sampling/importance_sampling_ratio/min": 0.5601468682289124, "sampling/sampling_logp_difference/max": 0.5795562267303467, "sampling/sampling_logp_difference/mean": 0.014398027211427689, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 245.921875, "completions/mean_terminated_length": 245.921875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2711595892906189, "epoch": 0.6544117647058824, "frac_reward_zero_std": 0.25, "grad_norm": 1.9371063223721268, "kl": 0.08010272681713104, "learning_rate": 9.587782338163667e-07, "loss": 0.0662, "num_tokens": 16735188.0, "reward": 0.8125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.862383484840393, "sampling/importance_sampling_ratio/mean": 1.000108242034912, "sampling/importance_sampling_ratio/min": 0.2419363260269165, "sampling/sampling_logp_difference/max": 1.4190807342529297, "sampling/sampling_logp_difference/mean": 0.015247123315930367, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 214.09375, "completions/mean_terminated_length": 214.09375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.29227620363235474, "epoch": 0.6556372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.05571439573830488, "kl": 0.091652050614357, "learning_rate": 9.584945223488226e-07, "loss": 0.0009, "num_tokens": 16769834.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6192150115966797, "sampling/importance_sampling_ratio/mean": 1.0003212690353394, "sampling/importance_sampling_ratio/min": 0.564578652381897, "sampling/sampling_logp_difference/max": 0.5716755390167236, "sampling/sampling_logp_difference/mean": 0.015373600646853447, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 183.953125, "completions/mean_terminated_length": 183.953125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.24862109124660492, "epoch": 0.6568627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.05687396399793213, "kl": 0.09466812759637833, "learning_rate": 9.582098801664443e-07, "loss": 0.0009, "num_tokens": 16801031.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.500941514968872, "sampling/importance_sampling_ratio/mean": 0.9996380805969238, "sampling/importance_sampling_ratio/min": 0.6125088334083557, "sampling/sampling_logp_difference/max": 0.4901919364929199, "sampling/sampling_logp_difference/mean": 0.0137968510389328, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 259.3125, "completions/mean_terminated_length": 259.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2820940315723419, "epoch": 0.6580882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 1.1695247246060607, "kl": 0.06301745027303696, "learning_rate": 9.579243078470378e-07, "loss": -0.0069, "num_tokens": 16838539.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6039164066314697, "sampling/importance_sampling_ratio/mean": 0.9995141625404358, "sampling/importance_sampling_ratio/min": 0.16875283420085907, "sampling/sampling_logp_difference/max": 1.779320240020752, "sampling/sampling_logp_difference/mean": 0.014724900014698505, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 219.390625, "completions/mean_terminated_length": 219.390625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.39771369099617004, "epoch": 0.6593137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.642792632565206, "kl": 0.12054871767759323, "learning_rate": 9.576378059702968e-07, "loss": 0.0329, "num_tokens": 16873700.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5629504919052124, "sampling/importance_sampling_ratio/mean": 0.9997038841247559, "sampling/importance_sampling_ratio/min": 0.5489786863327026, "sampling/sampling_logp_difference/max": 0.5996956825256348, "sampling/sampling_logp_difference/mean": 0.019365711137652397, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2835337817668915, "epoch": 0.6605392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.10563928191472281, "kl": 0.1101784035563469, "learning_rate": 9.573503751178018e-07, "loss": 0.001, "num_tokens": 16906708.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8463019132614136, "sampling/importance_sampling_ratio/mean": 0.9996192455291748, "sampling/importance_sampling_ratio/min": 0.5425854921340942, "sampling/sampling_logp_difference/max": 0.6131846904754639, "sampling/sampling_logp_difference/mean": 0.016335483640432358, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 199.859375, "completions/mean_terminated_length": 199.859375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.37571844458580017, "epoch": 0.6617647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 1.2575798644483294, "kl": 0.10863693803548813, "learning_rate": 9.570620158730194e-07, "loss": 0.0054, "num_tokens": 16946395.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7116420269012451, "sampling/importance_sampling_ratio/mean": 0.9990810751914978, "sampling/importance_sampling_ratio/min": 0.4442260265350342, "sampling/sampling_logp_difference/max": 0.8114218711853027, "sampling/sampling_logp_difference/mean": 0.0193776898086071, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 180.265625, "completions/mean_terminated_length": 180.265625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.22068551182746887, "epoch": 0.6629901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.06815042504597542, "kl": 0.08907188475131989, "learning_rate": 9.567727288213004e-07, "loss": 0.0008, "num_tokens": 16975404.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6308956146240234, "sampling/importance_sampling_ratio/mean": 1.000535011291504, "sampling/importance_sampling_ratio/min": 0.6361169815063477, "sampling/sampling_logp_difference/max": 0.48912930488586426, "sampling/sampling_logp_difference/mean": 0.013500725850462914, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 178.359375, "completions/mean_terminated_length": 178.359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2912602722644806, "epoch": 0.6642156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.8030424189206746, "kl": 0.136108860373497, "learning_rate": 9.564825145498793e-07, "loss": -0.0412, "num_tokens": 17004595.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.872259497642517, "sampling/importance_sampling_ratio/mean": 0.9996621608734131, "sampling/importance_sampling_ratio/min": 0.294066458940506, "sampling/sampling_logp_difference/max": 1.2239494323730469, "sampling/sampling_logp_difference/mean": 0.016193510964512825, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 164.578125, "completions/mean_terminated_length": 164.578125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.31142550706863403, "epoch": 0.6654411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 2.1293985363764585, "kl": 0.1263895183801651, "learning_rate": 9.561913736478728e-07, "loss": 0.0597, "num_tokens": 17035528.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996033310890198, "sampling/importance_sampling_ratio/min": 0.4838782846927643, "sampling/sampling_logp_difference/max": 0.8160734176635742, "sampling/sampling_logp_difference/mean": 0.0175727941095829, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.25655582547187805, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 1.4997438922921271, "kl": 0.07254540920257568, "learning_rate": 9.558993067062784e-07, "loss": 0.0149, "num_tokens": 17064764.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6207282543182373, "sampling/importance_sampling_ratio/mean": 1.0000991821289062, "sampling/importance_sampling_ratio/min": 0.4860598146915436, "sampling/sampling_logp_difference/max": 0.7214236259460449, "sampling/sampling_logp_difference/mean": 0.015043018385767937, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 247.46875, "completions/mean_terminated_length": 247.46875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29953157901763916, "epoch": 0.6678921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.040827915306439845, "kl": 0.06826378405094147, "learning_rate": 9.556063143179735e-07, "loss": 0.0006, "num_tokens": 17104554.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003316402435303, "sampling/importance_sampling_ratio/min": 0.2916131317615509, "sampling/sampling_logp_difference/max": 1.2323272228240967, "sampling/sampling_logp_difference/mean": 0.01632849872112274, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 179.65625, "completions/mean_terminated_length": 179.65625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.30334335565567017, "epoch": 0.6691176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.06335981783002397, "kl": 0.10095895081758499, "learning_rate": 9.55312397077714e-07, "loss": 0.001, "num_tokens": 17132644.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006049871444702, "sampling/importance_sampling_ratio/min": 0.6039285659790039, "sampling/sampling_logp_difference/max": 0.7801837921142578, "sampling/sampling_logp_difference/mean": 0.01764393225312233, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 227.8125, "completions/mean_terminated_length": 227.8125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2977250814437866, "epoch": 0.6703431372549019, "frac_reward_zero_std": 0.75, "grad_norm": 1.503040894183228, "kl": 0.08063647150993347, "learning_rate": 9.550175555821334e-07, "loss": 0.1021, "num_tokens": 17166632.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.738738775253296, "sampling/importance_sampling_ratio/mean": 1.000267505645752, "sampling/importance_sampling_ratio/min": 0.5096349120140076, "sampling/sampling_logp_difference/max": 0.674060583114624, "sampling/sampling_logp_difference/mean": 0.015589935705065727, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.33991408348083496, "epoch": 0.6715686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 2.2215866275205087, "kl": 0.14120438694953918, "learning_rate": 9.547217904297409e-07, "loss": -0.049, "num_tokens": 17194900.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000576972961426, "sampling/importance_sampling_ratio/min": 0.35791149735450745, "sampling/sampling_logp_difference/max": 1.0274696350097656, "sampling/sampling_logp_difference/mean": 0.01935707777738571, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 175.328125, "completions/mean_terminated_length": 175.328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3658367693424225, "epoch": 0.6727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 2.0852632848722275, "kl": 0.11114779859781265, "learning_rate": 9.544251022209216e-07, "loss": 0.0552, "num_tokens": 17229657.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996533989906311, "sampling/importance_sampling_ratio/min": 0.37608960270881653, "sampling/sampling_logp_difference/max": 0.9779279232025146, "sampling/sampling_logp_difference/mean": 0.020111430436372757, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 168.359375, "completions/mean_terminated_length": 168.359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.22088299691677094, "epoch": 0.6740196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.05858628617393321, "kl": 0.06984078884124756, "learning_rate": 9.541274915579334e-07, "loss": 0.0007, "num_tokens": 17255808.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999707818031311, "sampling/importance_sampling_ratio/min": 0.36338913440704346, "sampling/sampling_logp_difference/max": 1.012281060218811, "sampling/sampling_logp_difference/mean": 0.013919494114816189, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 199.28125, "completions/mean_terminated_length": 199.28125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.28910544514656067, "epoch": 0.6752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.517878511161227, "kl": 0.13955840468406677, "learning_rate": 9.538289590449071e-07, "loss": 0.0013, "num_tokens": 17288018.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7042194604873657, "sampling/importance_sampling_ratio/mean": 0.9997285008430481, "sampling/importance_sampling_ratio/min": 0.4405342936515808, "sampling/sampling_logp_difference/max": 0.8197669982910156, "sampling/sampling_logp_difference/mean": 0.015938732773065567, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 213.21875, "completions/mean_terminated_length": 213.21875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.29339146614074707, "epoch": 0.6764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.048436218632843996, "kl": 0.0689179003238678, "learning_rate": 9.535295052878449e-07, "loss": 0.0007, "num_tokens": 17321552.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6922236680984497, "sampling/importance_sampling_ratio/mean": 1.0000426769256592, "sampling/importance_sampling_ratio/min": 0.5164052844047546, "sampling/sampling_logp_difference/max": 0.6608633995056152, "sampling/sampling_logp_difference/mean": 0.015585601329803467, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.21433620154857635, "epoch": 0.6776960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.07873846618363181, "kl": 0.08914574980735779, "learning_rate": 9.53229130894619e-07, "loss": 0.0009, "num_tokens": 17346672.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8140681982040405, "sampling/importance_sampling_ratio/mean": 0.999647855758667, "sampling/importance_sampling_ratio/min": 0.48965176939964294, "sampling/sampling_logp_difference/max": 0.7140607833862305, "sampling/sampling_logp_difference/mean": 0.015001345425844193, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 162.78125, "completions/mean_terminated_length": 162.78125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.2795228362083435, "epoch": 0.678921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.10242324308658463, "kl": 0.08963474631309509, "learning_rate": 9.529278364749702e-07, "loss": 0.0008, "num_tokens": 17377138.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5608521699905396, "sampling/importance_sampling_ratio/mean": 1.0004630088806152, "sampling/importance_sampling_ratio/min": 0.4826413094997406, "sampling/sampling_logp_difference/max": 0.7284815311431885, "sampling/sampling_logp_difference/mean": 0.015321934595704079, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 185.859375, "completions/mean_terminated_length": 185.859375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.32190006971359253, "epoch": 0.6801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.380334760732346, "kl": 0.08094222843647003, "learning_rate": 9.526256226405073e-07, "loss": 0.0372, "num_tokens": 17405545.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.39084792137146, "sampling/importance_sampling_ratio/mean": 0.9996212720870972, "sampling/importance_sampling_ratio/min": 0.578363299369812, "sampling/sampling_logp_difference/max": 0.5475530624389648, "sampling/sampling_logp_difference/mean": 0.015896037220954895, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 184.546875, "completions/mean_terminated_length": 184.546875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.27940815687179565, "epoch": 0.6813725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.246729190634135, "kl": 0.08324264734983444, "learning_rate": 9.523224900047051e-07, "loss": 0.0589, "num_tokens": 17431420.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.655898928642273, "sampling/importance_sampling_ratio/mean": 1.0001144409179688, "sampling/importance_sampling_ratio/min": 0.48278486728668213, "sampling/sampling_logp_difference/max": 0.728184163570404, "sampling/sampling_logp_difference/mean": 0.015265392139554024, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 171.71875, "completions/mean_terminated_length": 171.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2746974527835846, "epoch": 0.6825980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 2.0743775941752993, "kl": 0.18216821551322937, "learning_rate": 9.520184391829036e-07, "loss": 0.0021, "num_tokens": 17463418.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6304324865341187, "sampling/importance_sampling_ratio/mean": 0.9998118877410889, "sampling/importance_sampling_ratio/min": 0.5492480993270874, "sampling/sampling_logp_difference/max": 0.5992050170898438, "sampling/sampling_logp_difference/mean": 0.015748387202620506, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 180.515625, "completions/mean_terminated_length": 180.515625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.32589060068130493, "epoch": 0.6838235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.6425778279453724, "kl": 0.0704319179058075, "learning_rate": 9.517134707923069e-07, "loss": -0.0216, "num_tokens": 17495099.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.7607169151306152, "sampling/importance_sampling_ratio/mean": 1.000322937965393, "sampling/importance_sampling_ratio/min": 0.29270172119140625, "sampling/sampling_logp_difference/max": 1.228601098060608, "sampling/sampling_logp_difference/mean": 0.017600977793335915, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 181.765625, "completions/mean_terminated_length": 181.765625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23727193474769592, "epoch": 0.6850490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.05431430416722844, "kl": 0.07662207633256912, "learning_rate": 9.514075854519813e-07, "loss": 0.0007, "num_tokens": 17523196.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.715319275856018, "sampling/importance_sampling_ratio/mean": 0.9998581409454346, "sampling/importance_sampling_ratio/min": 0.5144228339195251, "sampling/sampling_logp_difference/max": 0.6647096872329712, "sampling/sampling_logp_difference/mean": 0.01397109217941761, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 166.921875, "completions/mean_terminated_length": 166.921875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3472491502761841, "epoch": 0.6862745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 2.457981257648886, "kl": 0.10838024318218231, "learning_rate": 9.511007837828548e-07, "loss": -0.0213, "num_tokens": 17556007.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.957856297492981, "sampling/importance_sampling_ratio/mean": 1.0007283687591553, "sampling/importance_sampling_ratio/min": 0.5365152359008789, "sampling/sampling_logp_difference/max": 0.6718502044677734, "sampling/sampling_logp_difference/mean": 0.017687536776065826, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 252.296875, "completions/mean_terminated_length": 252.296875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3433667719364166, "epoch": 0.6875, "frac_reward_zero_std": 0.75, "grad_norm": 1.1470517825611433, "kl": 0.0592065192759037, "learning_rate": 9.507930664077153e-07, "loss": -0.0069, "num_tokens": 17595114.0, "reward": -0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9486160278320312, "sampling/importance_sampling_ratio/mean": 1.0003085136413574, "sampling/importance_sampling_ratio/min": 0.5001212358474731, "sampling/sampling_logp_difference/max": 0.6929047107696533, "sampling/sampling_logp_difference/mean": 0.01738228276371956, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 183.921875, "completions/mean_terminated_length": 183.921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2436867654323578, "epoch": 0.6887254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.05303062558787955, "kl": 0.07596056163311005, "learning_rate": 9.504844339512094e-07, "loss": 0.0007, "num_tokens": 17626421.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009114742279053, "sampling/importance_sampling_ratio/min": 0.29601848125457764, "sampling/sampling_logp_difference/max": 1.3902349472045898, "sampling/sampling_logp_difference/mean": 0.01490679569542408, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 172.3125, "completions/mean_terminated_length": 172.3125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.32237541675567627, "epoch": 0.6899509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 2.237576665563323, "kl": 0.10190116614103317, "learning_rate": 9.501748870398419e-07, "loss": 0.0389, "num_tokens": 17654809.0, "reward": -0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.689343810081482, "sampling/importance_sampling_ratio/mean": 0.9994897246360779, "sampling/importance_sampling_ratio/min": 0.482289582490921, "sampling/sampling_logp_difference/max": 0.729210615158081, "sampling/sampling_logp_difference/mean": 0.017055770382285118, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 189.640625, "completions/mean_terminated_length": 189.640625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.1565149873495102, "epoch": 0.6911764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.4900637771554077, "kl": 0.059271518141031265, "learning_rate": 9.498644263019731e-07, "loss": 0.0301, "num_tokens": 17686738.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5005675554275513, "sampling/importance_sampling_ratio/mean": 0.9995347261428833, "sampling/importance_sampling_ratio/min": 0.1834573894739151, "sampling/sampling_logp_difference/max": 1.6957728862762451, "sampling/sampling_logp_difference/mean": 0.011621063575148582, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 176.0625, "completions/mean_terminated_length": 176.0625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4577755033969879, "epoch": 0.6924019607843137, "frac_reward_zero_std": 0.0, "grad_norm": 3.612417444906285, "kl": 0.18480560183525085, "learning_rate": 9.495530523678186e-07, "loss": -0.0051, "num_tokens": 17717606.0, "reward": 0.0625, "reward_std": 0.9955304861068726, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7520650625228882, "sampling/importance_sampling_ratio/mean": 1.000302791595459, "sampling/importance_sampling_ratio/min": 0.6144919395446777, "sampling/sampling_logp_difference/max": 0.5607950687408447, "sampling/sampling_logp_difference/mean": 0.02176954783499241, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 188.953125, "completions/mean_terminated_length": 188.953125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2748880982398987, "epoch": 0.6936274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.5379158429291662, "kl": 0.08798833191394806, "learning_rate": 9.492407658694477e-07, "loss": 0.0107, "num_tokens": 17745587.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000697374343872, "sampling/importance_sampling_ratio/min": 0.6073539853096008, "sampling/sampling_logp_difference/max": 1.2174501419067383, "sampling/sampling_logp_difference/mean": 0.016703777015209198, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2802906632423401, "epoch": 0.6948529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 2.1508284482199413, "kl": 0.09377346932888031, "learning_rate": 9.489275674407825e-07, "loss": -0.015, "num_tokens": 17773279.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.595808982849121, "sampling/importance_sampling_ratio/mean": 0.9997985363006592, "sampling/importance_sampling_ratio/min": 0.5319787859916687, "sampling/sampling_logp_difference/max": 0.6311516761779785, "sampling/sampling_logp_difference/mean": 0.015627920627593994, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 142.046875, "completions/mean_terminated_length": 142.046875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.20774143934249878, "epoch": 0.696078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.06676551202190348, "kl": 0.06828676909208298, "learning_rate": 9.486134577175957e-07, "loss": 0.0006, "num_tokens": 17796754.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5373115539550781, "sampling/importance_sampling_ratio/mean": 1.0002179145812988, "sampling/importance_sampling_ratio/min": 0.6622359156608582, "sampling/sampling_logp_difference/max": 0.4300351142883301, "sampling/sampling_logp_difference/mean": 0.013358856551349163, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 198.96875, "completions/mean_terminated_length": 198.96875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3348020017147064, "epoch": 0.6973039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.6026369656150075, "kl": 0.08831897377967834, "learning_rate": 9.482984373375104e-07, "loss": 0.036, "num_tokens": 17829024.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999412298202515, "sampling/importance_sampling_ratio/min": 0.600390613079071, "sampling/sampling_logp_difference/max": 0.930025577545166, "sampling/sampling_logp_difference/mean": 0.018753811717033386, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3299071788787842, "epoch": 0.6985294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 2.1039569568245162, "kl": 0.12124665826559067, "learning_rate": 9.479825069399977e-07, "loss": 0.0055, "num_tokens": 17858568.0, "reward": 0.03125, "reward_std": 0.5143726468086243, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9641410112380981, "sampling/importance_sampling_ratio/mean": 1.00050950050354, "sampling/importance_sampling_ratio/min": 0.290988564491272, "sampling/sampling_logp_difference/max": 1.234471321105957, "sampling/sampling_logp_difference/mean": 0.01924917846918106, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 171.0625, "completions/mean_terminated_length": 171.0625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.28250452876091003, "epoch": 0.6997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.7806336214041554, "kl": 0.09873203933238983, "learning_rate": 9.476656671663766e-07, "loss": -0.0054, "num_tokens": 17891580.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.8723371028900146, "sampling/importance_sampling_ratio/mean": 0.9993788599967957, "sampling/importance_sampling_ratio/min": 0.6087372899055481, "sampling/sampling_logp_difference/max": 0.6271874904632568, "sampling/sampling_logp_difference/mean": 0.01713189110159874, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 182.96875, "completions/mean_terminated_length": 182.96875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2556510269641876, "epoch": 0.7009803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.5571421400333714, "kl": 0.07203780114650726, "learning_rate": 9.473479186598114e-07, "loss": -0.0093, "num_tokens": 17919642.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000793218612671, "sampling/importance_sampling_ratio/min": 0.5747721791267395, "sampling/sampling_logp_difference/max": 0.9725565910339355, "sampling/sampling_logp_difference/mean": 0.017283430323004723, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 195.015625, "completions/mean_terminated_length": 195.015625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22532916069030762, "epoch": 0.7022058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.068594738552317, "kl": 0.06998701393604279, "learning_rate": 9.470292620653119e-07, "loss": -0.0038, "num_tokens": 17949115.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8179765939712524, "sampling/importance_sampling_ratio/mean": 1.0004429817199707, "sampling/importance_sampling_ratio/min": 0.4138515293598175, "sampling/sampling_logp_difference/max": 0.882248044013977, "sampling/sampling_logp_difference/mean": 0.013591472990810871, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 214.078125, "completions/mean_terminated_length": 214.078125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.37085291743278503, "epoch": 0.7034313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.3661480693610237, "kl": 0.10367009043693542, "learning_rate": 9.467096980297304e-07, "loss": 0.0438, "num_tokens": 17980880.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009095668792725, "sampling/importance_sampling_ratio/min": 0.32664066553115845, "sampling/sampling_logp_difference/max": 1.1188945770263672, "sampling/sampling_logp_difference/mean": 0.01959863491356373, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.37925583124160767, "epoch": 0.7046568627450981, "frac_reward_zero_std": 0.0, "grad_norm": 3.0154665755784262, "kl": 0.13310685753822327, "learning_rate": 9.463892272017618e-07, "loss": 0.0451, "num_tokens": 18011768.0, "reward": 0.5625, "reward_std": 0.784286618232727, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999893307685852, "sampling/importance_sampling_ratio/min": 0.4033031761646271, "sampling/sampling_logp_difference/max": 0.9080667495727539, "sampling/sampling_logp_difference/mean": 0.01956643909215927, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 176.171875, "completions/mean_terminated_length": 176.171875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.27574622631073, "epoch": 0.7058823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 1.8423241448140895, "kl": 0.07914084196090698, "learning_rate": 9.460678502319416e-07, "loss": 0.0201, "num_tokens": 18038899.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000700831413269, "sampling/importance_sampling_ratio/min": 0.27442094683647156, "sampling/sampling_logp_difference/max": 1.5977290868759155, "sampling/sampling_logp_difference/mean": 0.01649722456932068, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 236.28125, "completions/mean_terminated_length": 236.28125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.33270812034606934, "epoch": 0.7071078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.830751486801874, "kl": 0.08072858303785324, "learning_rate": 9.457455677726447e-07, "loss": -0.0964, "num_tokens": 18075173.0, "reward": -0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.654782772064209, "sampling/importance_sampling_ratio/mean": 0.9999206066131592, "sampling/importance_sampling_ratio/min": 0.5717520713806152, "sampling/sampling_logp_difference/max": 0.5590498447418213, "sampling/sampling_logp_difference/mean": 0.01706099882721901, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 209.65625, "completions/mean_terminated_length": 209.65625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2930196225643158, "epoch": 0.7083333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 1.2614592752394247, "kl": 0.07866720855236053, "learning_rate": 9.454223804780841e-07, "loss": -0.0029, "num_tokens": 18106719.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.815672755241394, "sampling/importance_sampling_ratio/mean": 1.0005232095718384, "sampling/importance_sampling_ratio/min": 0.07906536757946014, "sampling/sampling_logp_difference/max": 2.537480354309082, "sampling/sampling_logp_difference/mean": 0.017369206994771957, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3268337547779083, "epoch": 0.7095588235294118, "frac_reward_zero_std": 0.25, "grad_norm": 3.4297343599552055, "kl": 0.0850410908460617, "learning_rate": 9.450982890043094e-07, "loss": 0.0429, "num_tokens": 18139391.0, "reward": -0.28125, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000206708908081, "sampling/importance_sampling_ratio/min": 0.1372629851102829, "sampling/sampling_logp_difference/max": 1.985856533050537, "sampling/sampling_logp_difference/mean": 0.01785438321530819, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 248.6875, "completions/mean_terminated_length": 248.6875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.30297189950942993, "epoch": 0.7107843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.019871671926416, "kl": 0.08185930550098419, "learning_rate": 9.447732940092059e-07, "loss": -0.0279, "num_tokens": 18176011.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8411680459976196, "sampling/importance_sampling_ratio/mean": 0.9997085332870483, "sampling/importance_sampling_ratio/min": 0.23401731252670288, "sampling/sampling_logp_difference/max": 1.4523601531982422, "sampling/sampling_logp_difference/mean": 0.017601585015654564, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 213.140625, "completions/mean_terminated_length": 213.140625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2970901131629944, "epoch": 0.7120098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 1.5351385312250692, "kl": 0.07222087681293488, "learning_rate": 9.444473961524927e-07, "loss": -0.0141, "num_tokens": 18218756.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6027271747589111, "sampling/importance_sampling_ratio/mean": 0.9998239874839783, "sampling/importance_sampling_ratio/min": 0.47394949197769165, "sampling/sampling_logp_difference/max": 0.7466545104980469, "sampling/sampling_logp_difference/mean": 0.018158020451664925, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 218.140625, "completions/mean_terminated_length": 218.140625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2991441488265991, "epoch": 0.7132352941176471, "frac_reward_zero_std": 0.25, "grad_norm": 2.1461756462892447, "kl": 0.0762210264801979, "learning_rate": 9.441205960957219e-07, "loss": 0.0535, "num_tokens": 18252589.0, "reward": 0.125, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005971193313599, "sampling/importance_sampling_ratio/min": 0.41775795817375183, "sampling/sampling_logp_difference/max": 0.8728530406951904, "sampling/sampling_logp_difference/mean": 0.017642326653003693, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 165.265625, "completions/mean_terminated_length": 165.265625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2318461537361145, "epoch": 0.7144607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.7609467454165173, "kl": 0.06704086065292358, "learning_rate": 9.43792894502277e-07, "loss": -0.019, "num_tokens": 18279614.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5931954383850098, "sampling/importance_sampling_ratio/mean": 0.999550461769104, "sampling/importance_sampling_ratio/min": 0.35211485624313354, "sampling/sampling_logp_difference/max": 1.0437978506088257, "sampling/sampling_logp_difference/mean": 0.01575883850455284, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 193.921875, "completions/mean_terminated_length": 193.921875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.33422935009002686, "epoch": 0.7156862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.4824358414655476, "kl": 0.11342482268810272, "learning_rate": 9.434642920373713e-07, "loss": -0.0443, "num_tokens": 18313945.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004544258117676, "sampling/importance_sampling_ratio/min": 0.3287467062473297, "sampling/sampling_logp_difference/max": 1.1124677658081055, "sampling/sampling_logp_difference/mean": 0.022340642288327217, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 186.609375, "completions/mean_terminated_length": 186.609375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2566809058189392, "epoch": 0.7169117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 4.651125667788851, "kl": 0.17529572546482086, "learning_rate": 9.431347893680472e-07, "loss": -0.105, "num_tokens": 18340672.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994093179702759, "sampling/importance_sampling_ratio/min": 0.1798427850008011, "sampling/sampling_logp_difference/max": 1.715672254562378, "sampling/sampling_logp_difference/mean": 0.018325788900256157, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 199.890625, "completions/mean_terminated_length": 199.890625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.16830769181251526, "epoch": 0.7181372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.4054503752299314, "kl": 0.04657375067472458, "learning_rate": 9.428043871631739e-07, "loss": 0.0256, "num_tokens": 18369193.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998905658721924, "sampling/importance_sampling_ratio/min": 0.3609904646873474, "sampling/sampling_logp_difference/max": 1.0189037322998047, "sampling/sampling_logp_difference/mean": 0.011816881597042084, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 198.078125, "completions/mean_terminated_length": 198.078125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3127084970474243, "epoch": 0.7193627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 2.3133259272413325, "kl": 0.07372134178876877, "learning_rate": 9.424730860934472e-07, "loss": -0.0365, "num_tokens": 18405102.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7537305355072021, "sampling/importance_sampling_ratio/mean": 1.000502586364746, "sampling/importance_sampling_ratio/min": 0.47892704606056213, "sampling/sampling_logp_difference/max": 0.7362070083618164, "sampling/sampling_logp_difference/mean": 0.01948605105280876, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 161.96875, "completions/mean_terminated_length": 161.96875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2641754150390625, "epoch": 0.7205882352941176, "frac_reward_zero_std": 0.25, "grad_norm": 3.012368980612821, "kl": 0.12216442823410034, "learning_rate": 9.421408868313873e-07, "loss": -0.0231, "num_tokens": 18429532.0, "reward": 0.0625, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7469195127487183, "sampling/importance_sampling_ratio/mean": 1.0002282857894897, "sampling/importance_sampling_ratio/min": 0.5910896062850952, "sampling/sampling_logp_difference/max": 0.5578539371490479, "sampling/sampling_logp_difference/mean": 0.016571495682001114, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.201697438955307, "epoch": 0.7218137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 2.2828015128304315, "kl": 0.06914472579956055, "learning_rate": 9.418077900513376e-07, "loss": 0.0625, "num_tokens": 18458604.0, "reward": 0.46875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5680956840515137, "sampling/importance_sampling_ratio/mean": 1.0003582239151, "sampling/importance_sampling_ratio/min": 0.5520696043968201, "sampling/sampling_logp_difference/max": 0.5940811634063721, "sampling/sampling_logp_difference/mean": 0.014866025187075138, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 181.921875, "completions/mean_terminated_length": 181.921875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2416316270828247, "epoch": 0.7230392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.4748538219743468, "kl": 0.05072364956140518, "learning_rate": 9.414737964294634e-07, "loss": -0.0024, "num_tokens": 18491047.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9876954555511475, "sampling/importance_sampling_ratio/mean": 1.0002965927124023, "sampling/importance_sampling_ratio/min": 0.5677002668380737, "sampling/sampling_logp_difference/max": 0.6869759559631348, "sampling/sampling_logp_difference/mean": 0.016352690756320953, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 207.78125, "completions/mean_terminated_length": 207.78125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2534104585647583, "epoch": 0.7242647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 1.7327769190709892, "kl": 0.1017884761095047, "learning_rate": 9.411389066437507e-07, "loss": -0.0358, "num_tokens": 18525577.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7781813144683838, "sampling/importance_sampling_ratio/mean": 1.0002974271774292, "sampling/importance_sampling_ratio/min": 0.46167972683906555, "sampling/sampling_logp_difference/max": 0.7728838920593262, "sampling/sampling_logp_difference/mean": 0.018235966563224792, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.24245025217533112, "epoch": 0.7254901960784313, "frac_reward_zero_std": 0.25, "grad_norm": 3.0873627688625596, "kl": 0.09672357141971588, "learning_rate": 9.408031213740044e-07, "loss": -0.1126, "num_tokens": 18553737.0, "reward": 0.1875, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.8540359735488892, "sampling/importance_sampling_ratio/mean": 1.000643253326416, "sampling/importance_sampling_ratio/min": 0.5420742630958557, "sampling/sampling_logp_difference/max": 0.6173648834228516, "sampling/sampling_logp_difference/mean": 0.01740797609090805, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2418367564678192, "epoch": 0.7267156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 2.1264170023692373, "kl": 0.0872611477971077, "learning_rate": 9.404664413018476e-07, "loss": 0.0105, "num_tokens": 18587685.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.817976713180542, "sampling/importance_sampling_ratio/mean": 0.9995722770690918, "sampling/importance_sampling_ratio/min": 0.3772679567337036, "sampling/sampling_logp_difference/max": 0.974799633026123, "sampling/sampling_logp_difference/mean": 0.01760914921760559, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 277.140625, "completions/mean_terminated_length": 277.140625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.25757867097854614, "epoch": 0.7279411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 1.101525224955409, "kl": 0.047139450907707214, "learning_rate": 9.401288671107193e-07, "loss": -0.036, "num_tokens": 18625790.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995555281639099, "sampling/importance_sampling_ratio/min": 0.1641063690185547, "sampling/sampling_logp_difference/max": 1.8072404861450195, "sampling/sampling_logp_difference/mean": 0.016724945977330208, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 187.765625, "completions/mean_terminated_length": 187.765625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2661805748939514, "epoch": 0.7291666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 1.2565356284367513, "kl": 0.10957401990890503, "learning_rate": 9.397903994858735e-07, "loss": -0.0139, "num_tokens": 18657599.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8815131187438965, "sampling/importance_sampling_ratio/mean": 1.000257968902588, "sampling/importance_sampling_ratio/min": 0.5219682455062866, "sampling/sampling_logp_difference/max": 0.6501485109329224, "sampling/sampling_logp_difference/mean": 0.01742507517337799, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 200.90625, "completions/mean_terminated_length": 200.90625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.22445990145206451, "epoch": 0.7303921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.823977890106038, "kl": 0.0618349090218544, "learning_rate": 9.394510391143786e-07, "loss": -0.0216, "num_tokens": 18685913.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001581907272339, "sampling/importance_sampling_ratio/min": 0.4787578880786896, "sampling/sampling_logp_difference/max": 0.7520298957824707, "sampling/sampling_logp_difference/mean": 0.016777168959379196, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 228.0625, "completions/mean_terminated_length": 228.0625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.1810968518257141, "epoch": 0.7316176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.049935742312966276, "kl": 0.040459342300891876, "learning_rate": 9.391107866851142e-07, "loss": 0.0004, "num_tokens": 18733485.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7553400993347168, "sampling/importance_sampling_ratio/mean": 1.0002613067626953, "sampling/importance_sampling_ratio/min": 0.3586792051792145, "sampling/sampling_logp_difference/max": 1.0253269672393799, "sampling/sampling_logp_difference/mean": 0.01264331303536892, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 182.046875, "completions/mean_terminated_length": 182.046875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3017774820327759, "epoch": 0.7328431372549019, "frac_reward_zero_std": 0.5, "grad_norm": 2.256661925458094, "kl": 0.10925137996673584, "learning_rate": 9.387696428887715e-07, "loss": 0.0262, "num_tokens": 18759536.0, "reward": -0.5625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99953293800354, "sampling/importance_sampling_ratio/min": 0.4693239629268646, "sampling/sampling_logp_difference/max": 0.7564619779586792, "sampling/sampling_logp_difference/mean": 0.01892591454088688, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 156.53125, "completions/mean_terminated_length": 156.53125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.21708914637565613, "epoch": 0.7340686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 2.047905324272125, "kl": 0.10377472639083862, "learning_rate": 9.384276084178504e-07, "loss": 0.045, "num_tokens": 18783746.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6252808570861816, "sampling/importance_sampling_ratio/mean": 0.9993909597396851, "sampling/importance_sampling_ratio/min": 0.37504148483276367, "sampling/sampling_logp_difference/max": 0.9807186126708984, "sampling/sampling_logp_difference/mean": 0.016367195174098015, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 210.203125, "completions/mean_terminated_length": 210.203125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.32918208837509155, "epoch": 0.7352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 2.0687708861778114, "kl": 0.10704650729894638, "learning_rate": 9.380846839666595e-07, "loss": -0.0631, "num_tokens": 18830031.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7516393661499023, "sampling/importance_sampling_ratio/mean": 1.0006492137908936, "sampling/importance_sampling_ratio/min": 0.5185782313346863, "sampling/sampling_logp_difference/max": 0.6566643714904785, "sampling/sampling_logp_difference/mean": 0.01922888122498989, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 185.28125, "completions/mean_terminated_length": 185.28125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2313532829284668, "epoch": 0.7365196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.8540450253251732, "kl": 0.11107133328914642, "learning_rate": 9.377408702313136e-07, "loss": -0.0061, "num_tokens": 18857905.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002211332321167, "sampling/importance_sampling_ratio/min": 0.4707980751991272, "sampling/sampling_logp_difference/max": 1.8231003284454346, "sampling/sampling_logp_difference/mean": 0.016610242426395416, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 174.109375, "completions/mean_terminated_length": 174.109375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.15651580691337585, "epoch": 0.7377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.1056531568919814, "kl": 0.07144558429718018, "learning_rate": 9.37396167909733e-07, "loss": 0.0007, "num_tokens": 18887800.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001873970031738, "sampling/importance_sampling_ratio/min": 0.34135210514068604, "sampling/sampling_logp_difference/max": 1.074840784072876, "sampling/sampling_logp_difference/mean": 0.011767935007810593, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 176.703125, "completions/mean_terminated_length": 176.703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.232400581240654, "epoch": 0.7389705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 3.4091177250320435, "kl": 0.09642212837934494, "learning_rate": 9.370505777016413e-07, "loss": -0.0364, "num_tokens": 18915429.0, "reward": 0.21875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009019374847412, "sampling/importance_sampling_ratio/min": 0.44574272632598877, "sampling/sampling_logp_difference/max": 1.4517979621887207, "sampling/sampling_logp_difference/mean": 0.018499933183193207, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 221.734375, "completions/mean_terminated_length": 221.734375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.22668619453907013, "epoch": 0.7401960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.3208188812787982, "kl": 0.06442856788635254, "learning_rate": 9.367041003085648e-07, "loss": 0.0001, "num_tokens": 18948612.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5386793613433838, "sampling/importance_sampling_ratio/mean": 0.9998711347579956, "sampling/importance_sampling_ratio/min": 0.30428776144981384, "sampling/sampling_logp_difference/max": 1.1897814273834229, "sampling/sampling_logp_difference/mean": 0.0150670874863863, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 247.0625, "completions/mean_terminated_length": 247.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.2504423260688782, "epoch": 0.741421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.06447632382614953, "kl": 0.054915666580200195, "learning_rate": 9.363567364338307e-07, "loss": 0.0005, "num_tokens": 18986248.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994238615036011, "sampling/importance_sampling_ratio/min": 0.3814408779144287, "sampling/sampling_logp_difference/max": 0.9637994766235352, "sampling/sampling_logp_difference/mean": 0.017830682918429375, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 199.828125, "completions/mean_terminated_length": 199.828125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.20959287881851196, "epoch": 0.7426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.6888387770785338, "kl": 0.07850334048271179, "learning_rate": 9.360084867825658e-07, "loss": 0.0147, "num_tokens": 19018877.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003209114074707, "sampling/importance_sampling_ratio/min": 0.4849974513053894, "sampling/sampling_logp_difference/max": 1.4400253295898438, "sampling/sampling_logp_difference/mean": 0.015034351497888565, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 324.0, "completions/mean_terminated_length": 324.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2272258996963501, "epoch": 0.7438725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.296258183615557, "kl": 0.053450606763362885, "learning_rate": 9.356593520616946e-07, "loss": 0.0356, "num_tokens": 19068989.0, "reward": -0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": -0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999680757522583, "sampling/importance_sampling_ratio/min": 0.397955060005188, "sampling/sampling_logp_difference/max": 1.0813994407653809, "sampling/sampling_logp_difference/mean": 0.01458294689655304, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.21361863613128662, "epoch": 0.7450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.07297670675866863, "kl": 0.06689755618572235, "learning_rate": 9.353093329799386e-07, "loss": 0.0007, "num_tokens": 19097077.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999480247497559, "sampling/importance_sampling_ratio/min": 0.5174911022186279, "sampling/sampling_logp_difference/max": 0.8329944610595703, "sampling/sampling_logp_difference/mean": 0.013164438307285309, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 231.71875, "completions/mean_terminated_length": 231.71875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.16736982762813568, "epoch": 0.7463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.07469319168022485, "kl": 0.0580403134226799, "learning_rate": 9.349584302478144e-07, "loss": 0.0005, "num_tokens": 19131683.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001057386398315, "sampling/importance_sampling_ratio/min": 0.5198130011558533, "sampling/sampling_logp_difference/max": 0.6996743679046631, "sampling/sampling_logp_difference/mean": 0.012384508736431599, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 231.390625, "completions/mean_terminated_length": 231.390625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.17477071285247803, "epoch": 0.7475490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.20948898139056, "kl": 0.07928837090730667, "learning_rate": 9.346066445776321e-07, "loss": 0.0212, "num_tokens": 19164636.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7640935182571411, "sampling/importance_sampling_ratio/mean": 0.9995098114013672, "sampling/importance_sampling_ratio/min": 0.5353386402130127, "sampling/sampling_logp_difference/max": 0.6248557567596436, "sampling/sampling_logp_difference/mean": 0.011636776849627495, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 271.4375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.18699999153614044, "epoch": 0.7487745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.224725062560454, "kl": 0.07661756873130798, "learning_rate": 9.342539766834945e-07, "loss": 0.0342, "num_tokens": 19198888.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.931645393371582, "sampling/importance_sampling_ratio/mean": 0.9998631477355957, "sampling/importance_sampling_ratio/min": 0.3853294253349304, "sampling/sampling_logp_difference/max": 0.9536566734313965, "sampling/sampling_logp_difference/mean": 0.013207919895648956, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 232.734375, "completions/mean_terminated_length": 232.734375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.1946050077676773, "epoch": 0.75, "frac_reward_zero_std": 0.5, "grad_norm": 1.8619925595998692, "kl": 0.08101022243499756, "learning_rate": 9.339004272812949e-07, "loss": -0.0113, "num_tokens": 19233431.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9876788854599, "sampling/importance_sampling_ratio/mean": 0.9999079704284668, "sampling/importance_sampling_ratio/min": 0.14849144220352173, "sampling/sampling_logp_difference/max": 1.9072279930114746, "sampling/sampling_logp_difference/mean": 0.015206235460937023, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 240.109375, "completions/mean_terminated_length": 240.109375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.20316070318222046, "epoch": 0.7512254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.26616001363097, "kl": 0.06727971136569977, "learning_rate": 9.335459970887165e-07, "loss": 0.0262, "num_tokens": 19265662.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.620835304260254, "sampling/importance_sampling_ratio/mean": 1.000004529953003, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 0.6855719089508057, "sampling/sampling_logp_difference/mean": 0.013958821073174477, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 265.4375, "completions/mean_terminated_length": 265.4375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2019714117050171, "epoch": 0.7524509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.06056981560619988, "kl": 0.04912319406867027, "learning_rate": 9.331906868252299e-07, "loss": 0.0004, "num_tokens": 19303882.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6355785131454468, "sampling/importance_sampling_ratio/mean": 0.9992480874061584, "sampling/importance_sampling_ratio/min": 0.28295519948005676, "sampling/sampling_logp_difference/max": 1.2624666690826416, "sampling/sampling_logp_difference/mean": 0.01610996387898922, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 265.890625, "completions/mean_terminated_length": 265.890625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1941600888967514, "epoch": 0.7536764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.07376953339828347, "kl": 0.053942590951919556, "learning_rate": 9.328344972120925e-07, "loss": 0.0005, "num_tokens": 19341299.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6283336877822876, "sampling/importance_sampling_ratio/mean": 0.9999680519104004, "sampling/importance_sampling_ratio/min": 0.267314612865448, "sampling/sampling_logp_difference/max": 1.319329023361206, "sampling/sampling_logp_difference/mean": 0.013370128348469734, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 282.1875, "completions/mean_terminated_length": 282.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2386167347431183, "epoch": 0.7549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.103684839254737, "kl": 0.04838567599654198, "learning_rate": 9.324774289723467e-07, "loss": 0.0036, "num_tokens": 19380063.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9849047660827637, "sampling/importance_sampling_ratio/mean": 0.999728798866272, "sampling/importance_sampling_ratio/min": 0.5693269968032837, "sampling/sampling_logp_difference/max": 0.6855709552764893, "sampling/sampling_logp_difference/mean": 0.013577599078416824, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 256.90625, "completions/mean_terminated_length": 256.90625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.16752922534942627, "epoch": 0.7561274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.05594665590122851, "kl": 0.058025140315294266, "learning_rate": 9.321194828308183e-07, "loss": 0.0005, "num_tokens": 19411161.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994553327560425, "sampling/importance_sampling_ratio/min": 0.31814250349998474, "sampling/sampling_logp_difference/max": 1.1452558040618896, "sampling/sampling_logp_difference/mean": 0.01149724330753088, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 224.125, "completions/mean_terminated_length": 224.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.22526061534881592, "epoch": 0.7573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.1830682491617488, "kl": 0.0643705353140831, "learning_rate": 9.317606595141155e-07, "loss": -0.0464, "num_tokens": 19442737.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997191429138184, "sampling/importance_sampling_ratio/min": 0.056884463876485825, "sampling/sampling_logp_difference/max": 2.8667330741882324, "sampling/sampling_logp_difference/mean": 0.015809137374162674, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 256.1875, "completions/mean_terminated_length": 256.1875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.22948342561721802, "epoch": 0.758578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.8439502527324416, "kl": 0.0796620324254036, "learning_rate": 9.314009597506265e-07, "loss": 0.1217, "num_tokens": 19473085.0, "reward": 0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000457763671875, "sampling/importance_sampling_ratio/min": 0.3986046314239502, "sampling/sampling_logp_difference/max": 0.9197852611541748, "sampling/sampling_logp_difference/mean": 0.014308687299489975, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 257.828125, "completions/mean_terminated_length": 257.828125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.25681865215301514, "epoch": 0.7598039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.1337875133983972, "kl": 0.0655963122844696, "learning_rate": 9.310403842705194e-07, "loss": -0.016, "num_tokens": 19505826.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9782131910324097, "sampling/importance_sampling_ratio/mean": 1.0000708103179932, "sampling/importance_sampling_ratio/min": 0.45821645855903625, "sampling/sampling_logp_difference/max": 0.7804136276245117, "sampling/sampling_logp_difference/mean": 0.016288483515381813, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 333.453125, "completions/mean_terminated_length": 333.453125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.27068203687667847, "epoch": 0.7610294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.3486866936154918, "kl": 0.06823822855949402, "learning_rate": 9.306789338057393e-07, "loss": -0.0288, "num_tokens": 19548063.0, "reward": 0.0, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999840497970581, "sampling/importance_sampling_ratio/min": 0.38916364312171936, "sampling/sampling_logp_difference/max": 0.9877901077270508, "sampling/sampling_logp_difference/mean": 0.016273140907287598, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.18036212027072906, "epoch": 0.7622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.14617592910948715, "kl": 0.05190131440758705, "learning_rate": 9.303166090900081e-07, "loss": 0.0004, "num_tokens": 19579535.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9100128412246704, "sampling/importance_sampling_ratio/mean": 1.000030279159546, "sampling/importance_sampling_ratio/min": 0.250815212726593, "sampling/sampling_logp_difference/max": 1.3830387592315674, "sampling/sampling_logp_difference/mean": 0.01475166529417038, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 272.34375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.21046610176563263, "epoch": 0.7634803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.2400533275142525, "kl": 0.0655660331249237, "learning_rate": 9.299534108588217e-07, "loss": -0.002, "num_tokens": 19614485.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.9693632125854492, "sampling/importance_sampling_ratio/mean": 1.0004092454910278, "sampling/importance_sampling_ratio/min": 0.5376322865486145, "sampling/sampling_logp_difference/max": 0.6777102947235107, "sampling/sampling_logp_difference/mean": 0.013047540560364723, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 321.328125, "completions/mean_terminated_length": 321.328125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.15701323747634888, "epoch": 0.7647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.043994570241456525, "kl": 0.03948718309402466, "learning_rate": 9.295893398494497e-07, "loss": 0.0004, "num_tokens": 19653674.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996031522750854, "sampling/importance_sampling_ratio/min": 0.4808013141155243, "sampling/sampling_logp_difference/max": 2.3391895294189453, "sampling/sampling_logp_difference/mean": 0.010676901787519455, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 216.390625, "completions/mean_terminated_length": 216.390625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.20857155323028564, "epoch": 0.7659313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.10756971013522826, "kl": 0.09376202523708344, "learning_rate": 9.29224396800933e-07, "loss": 0.0009, "num_tokens": 19683651.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6223664283752441, "sampling/importance_sampling_ratio/mean": 0.9997443556785583, "sampling/importance_sampling_ratio/min": 0.5096457600593567, "sampling/sampling_logp_difference/max": 0.674039363861084, "sampling/sampling_logp_difference/mean": 0.01420876570045948, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 229.78125, "completions/mean_terminated_length": 229.78125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.22043286263942719, "epoch": 0.7671568627450981, "frac_reward_zero_std": 0.75, "grad_norm": 1.3501048673191143, "kl": 0.07209709286689758, "learning_rate": 9.288585824540832e-07, "loss": 0.0099, "num_tokens": 19717621.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.9848257303237915, "sampling/importance_sampling_ratio/mean": 0.9999575018882751, "sampling/importance_sampling_ratio/min": 0.481048047542572, "sampling/sampling_logp_difference/max": 0.731788158416748, "sampling/sampling_logp_difference/mean": 0.014640593901276588, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 151.734375, "completions/mean_terminated_length": 151.734375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1915517896413803, "epoch": 0.7683823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.08727131737687056, "kl": 0.08875685185194016, "learning_rate": 9.284918975514797e-07, "loss": 0.0009, "num_tokens": 19741828.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.804423213005066, "sampling/importance_sampling_ratio/mean": 0.9999748468399048, "sampling/importance_sampling_ratio/min": 0.495414674282074, "sampling/sampling_logp_difference/max": 0.7023601531982422, "sampling/sampling_logp_difference/mean": 0.013834046199917793, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 294.578125, "completions/mean_terminated_length": 294.578125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.22867414355278015, "epoch": 0.7696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.9382855513200445, "kl": 0.07310361415147781, "learning_rate": 9.281243428374701e-07, "loss": 0.0048, "num_tokens": 19774457.0, "reward": -0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003331899642944, "sampling/importance_sampling_ratio/min": 0.1942329853773117, "sampling/sampling_logp_difference/max": 1.6386969089508057, "sampling/sampling_logp_difference/mean": 0.014818103052675724, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 219.34375, "completions/mean_terminated_length": 219.34375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.23702575266361237, "epoch": 0.7708333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 1.38462137376169, "kl": 0.06711365282535553, "learning_rate": 9.277559190581669e-07, "loss": -0.0636, "num_tokens": 19812911.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6645729541778564, "sampling/importance_sampling_ratio/mean": 1.0004074573516846, "sampling/importance_sampling_ratio/min": 0.3960508406162262, "sampling/sampling_logp_difference/max": 0.9262127876281738, "sampling/sampling_logp_difference/mean": 0.014895858243107796, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 164.15625, "completions/mean_terminated_length": 164.15625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2659897804260254, "epoch": 0.7720588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 1.7038503647603291, "kl": 0.09886568784713745, "learning_rate": 9.273866269614473e-07, "loss": -0.0624, "num_tokens": 19838233.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.785392165184021, "sampling/importance_sampling_ratio/mean": 1.0003658533096313, "sampling/importance_sampling_ratio/min": 0.5384262204170227, "sampling/sampling_logp_difference/max": 0.6191048622131348, "sampling/sampling_logp_difference/mean": 0.015854064375162125, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 255.984375, "completions/mean_terminated_length": 255.984375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2972226142883301, "epoch": 0.7732843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.5270971962584234, "kl": 0.060013528913259506, "learning_rate": 9.270164672969507e-07, "loss": -0.007, "num_tokens": 19867592.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5072145462036133, "sampling/importance_sampling_ratio/mean": 0.9994542598724365, "sampling/importance_sampling_ratio/min": 0.3650263249874115, "sampling/sampling_logp_difference/max": 1.0077857971191406, "sampling/sampling_logp_difference/mean": 0.016513332724571228, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 244.828125, "completions/mean_terminated_length": 244.828125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2695668339729309, "epoch": 0.7745098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 1.3303767200565322, "kl": 0.07084056735038757, "learning_rate": 9.266454408160777e-07, "loss": -0.0347, "num_tokens": 19899885.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007511377334595, "sampling/importance_sampling_ratio/min": 0.4982454776763916, "sampling/sampling_logp_difference/max": 0.8159983158111572, "sampling/sampling_logp_difference/mean": 0.016307897865772247, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 155.046875, "completions/mean_terminated_length": 155.046875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.17744068801403046, "epoch": 0.7757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.10603676583593676, "kl": 0.07468618452548981, "learning_rate": 9.262735482719887e-07, "loss": 0.0007, "num_tokens": 19923392.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002634525299072, "sampling/importance_sampling_ratio/min": 0.5914404988288879, "sampling/sampling_logp_difference/max": 0.8146648406982422, "sampling/sampling_logp_difference/mean": 0.014950464479625225, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 207.546875, "completions/mean_terminated_length": 207.546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2222384810447693, "epoch": 0.7769607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.3618649667774445, "kl": 0.07804533839225769, "learning_rate": 9.259007904196021e-07, "loss": -0.0396, "num_tokens": 19955219.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5744249820709229, "sampling/importance_sampling_ratio/mean": 0.999670147895813, "sampling/importance_sampling_ratio/min": 0.459637314081192, "sampling/sampling_logp_difference/max": 0.7773175239562988, "sampling/sampling_logp_difference/mean": 0.014833863824605942, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 279.078125, "completions/mean_terminated_length": 279.078125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.28159821033477783, "epoch": 0.7781862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.4507699159557401, "kl": 0.07019156217575073, "learning_rate": 9.255271680155923e-07, "loss": -0.0141, "num_tokens": 19992616.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001819133758545, "sampling/importance_sampling_ratio/min": 0.33303511142730713, "sampling/sampling_logp_difference/max": 1.0995073318481445, "sampling/sampling_logp_difference/mean": 0.015980804339051247, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 210.671875, "completions/mean_terminated_length": 210.671875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.22452111542224884, "epoch": 0.7794117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 1.3651495022210556, "kl": 0.050095200538635254, "learning_rate": 9.251526818183896e-07, "loss": 0.0018, "num_tokens": 20027315.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6550902128219604, "sampling/importance_sampling_ratio/mean": 0.9999434947967529, "sampling/importance_sampling_ratio/min": 0.43477028608322144, "sampling/sampling_logp_difference/max": 0.832937479019165, "sampling/sampling_logp_difference/mean": 0.015550890006124973, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 190.734375, "completions/mean_terminated_length": 190.734375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.1741478443145752, "epoch": 0.7806372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.4642627408447395, "kl": 0.05440554767847061, "learning_rate": 9.247773325881769e-07, "loss": -0.0098, "num_tokens": 20055058.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7198848724365234, "sampling/importance_sampling_ratio/mean": 0.9996518492698669, "sampling/importance_sampling_ratio/min": 0.5685391426086426, "sampling/sampling_logp_difference/max": 0.5646851062774658, "sampling/sampling_logp_difference/mean": 0.012164910323917866, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 194.984375, "completions/mean_terminated_length": 194.984375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.22004134953022003, "epoch": 0.7818627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 1.320737729904589, "kl": 0.07392781972885132, "learning_rate": 9.244011210868895e-07, "loss": -0.0062, "num_tokens": 20085537.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8553956747055054, "sampling/importance_sampling_ratio/mean": 0.9997206926345825, "sampling/importance_sampling_ratio/min": 0.42811888456344604, "sampling/sampling_logp_difference/max": 0.8483543395996094, "sampling/sampling_logp_difference/mean": 0.015586144290864468, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 196.03125, "completions/mean_terminated_length": 196.03125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.19160839915275574, "epoch": 0.7830882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.052688931066442, "kl": 0.0540931262075901, "learning_rate": 9.240240480782129e-07, "loss": 0.0005, "num_tokens": 20114563.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5912364721298218, "sampling/importance_sampling_ratio/mean": 0.9991684556007385, "sampling/importance_sampling_ratio/min": 0.42439547181129456, "sampling/sampling_logp_difference/max": 0.8570895195007324, "sampling/sampling_logp_difference/mean": 0.014237429946660995, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 239.296875, "completions/mean_terminated_length": 239.296875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27263566851615906, "epoch": 0.7843137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.5731791942277236, "kl": 0.06309030950069427, "learning_rate": 9.236461143275815e-07, "loss": -0.0386, "num_tokens": 20149846.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.57704496383667, "sampling/importance_sampling_ratio/mean": 1.0001354217529297, "sampling/importance_sampling_ratio/min": 0.5680131316184998, "sampling/sampling_logp_difference/max": 0.5656107664108276, "sampling/sampling_logp_difference/mean": 0.015975212678313255, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 209.796875, "completions/mean_terminated_length": 209.796875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.25127124786376953, "epoch": 0.7855392156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.8915945066494564, "kl": 0.11664341390132904, "learning_rate": 9.232673206021767e-07, "loss": -0.0287, "num_tokens": 20177657.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8150728940963745, "sampling/importance_sampling_ratio/mean": 1.000038743019104, "sampling/importance_sampling_ratio/min": 0.5329506993293762, "sampling/sampling_logp_difference/max": 0.629326343536377, "sampling/sampling_logp_difference/mean": 0.01448842603713274, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 191.046875, "completions/mean_terminated_length": 191.046875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20816347002983093, "epoch": 0.7867647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 1.3465381563685925, "kl": 0.08171716332435608, "learning_rate": 9.228876676709259e-07, "loss": -0.0336, "num_tokens": 20205516.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6132919788360596, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.5158289670944214, "sampling/sampling_logp_difference/max": 0.6619800329208374, "sampling/sampling_logp_difference/mean": 0.012775387614965439, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 199.453125, "completions/mean_terminated_length": 199.453125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2456672191619873, "epoch": 0.7879901960784313, "frac_reward_zero_std": 0.5, "grad_norm": 2.053343337626516, "kl": 0.07074607908725739, "learning_rate": 9.225071563045006e-07, "loss": -0.027, "num_tokens": 20233449.0, "reward": 0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006208419799805, "sampling/importance_sampling_ratio/min": 0.4944218397140503, "sampling/sampling_logp_difference/max": 0.9277139902114868, "sampling/sampling_logp_difference/mean": 0.014937615022063255, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 172.171875, "completions/mean_terminated_length": 172.171875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.22118590772151947, "epoch": 0.7892156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 2.0263598295111773, "kl": 0.07796844840049744, "learning_rate": 9.221257872753144e-07, "loss": 0.007, "num_tokens": 20261348.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992664456367493, "sampling/importance_sampling_ratio/min": 0.3534892499446869, "sampling/sampling_logp_difference/max": 1.0399022102355957, "sampling/sampling_logp_difference/mean": 0.016085071489214897, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 152.078125, "completions/mean_terminated_length": 152.078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.27297282218933105, "epoch": 0.7904411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 2.331542394983071, "kl": 0.1148001030087471, "learning_rate": 9.217435613575226e-07, "loss": 0.0129, "num_tokens": 20286313.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8570010662078857, "sampling/importance_sampling_ratio/mean": 1.000152587890625, "sampling/importance_sampling_ratio/min": 0.6056237816810608, "sampling/sampling_logp_difference/max": 0.6189628839492798, "sampling/sampling_logp_difference/mean": 0.01716921478509903, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 162.703125, "completions/mean_terminated_length": 162.703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.19279450178146362, "epoch": 0.7916666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 1.211441792495886, "kl": 0.07502549886703491, "learning_rate": 9.213604793270196e-07, "loss": -0.008, "num_tokens": 20311382.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006800889968872, "sampling/importance_sampling_ratio/min": 0.2587150037288666, "sampling/sampling_logp_difference/max": 1.3520281314849854, "sampling/sampling_logp_difference/mean": 0.014370124787092209, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 263.8125, "completions/mean_terminated_length": 263.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.29367929697036743, "epoch": 0.7928921568627451, "frac_reward_zero_std": 0.25, "grad_norm": 1.6522798136680616, "kl": 0.0979141965508461, "learning_rate": 9.209765419614373e-07, "loss": 0.0023, "num_tokens": 20343178.0, "reward": 0.0625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5994728803634644, "sampling/importance_sampling_ratio/mean": 1.0001908540725708, "sampling/importance_sampling_ratio/min": 0.296130895614624, "sampling/sampling_logp_difference/max": 1.2169537544250488, "sampling/sampling_logp_difference/mean": 0.01599789410829544, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 250.40625, "completions/mean_terminated_length": 250.40625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2368079423904419, "epoch": 0.7941176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 0.7249496990139329, "kl": 0.05633680149912834, "learning_rate": 9.205917500401447e-07, "loss": -0.0046, "num_tokens": 20377812.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7612030506134033, "sampling/importance_sampling_ratio/mean": 1.0002657175064087, "sampling/importance_sampling_ratio/min": 0.00012346301809884608, "sampling/sampling_logp_difference/max": 8.999568939208984, "sampling/sampling_logp_difference/mean": 0.014381260611116886, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 189.859375, "completions/mean_terminated_length": 189.859375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.27118250727653503, "epoch": 0.7953431372549019, "frac_reward_zero_std": 0.5, "grad_norm": 1.91543302081558, "kl": 0.0993255227804184, "learning_rate": 9.202061043442447e-07, "loss": -0.0498, "num_tokens": 20405339.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003424882888794, "sampling/importance_sampling_ratio/min": 0.3208965063095093, "sampling/sampling_logp_difference/max": 1.1366366147994995, "sampling/sampling_logp_difference/mean": 0.015637464821338654, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 218.90625, "completions/mean_terminated_length": 218.90625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.31074705719947815, "epoch": 0.7965686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 1.765669451975907, "kl": 0.06831445544958115, "learning_rate": 9.198196056565738e-07, "loss": -0.0094, "num_tokens": 20438261.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000112533569336, "sampling/importance_sampling_ratio/min": 0.37548714876174927, "sampling/sampling_logp_difference/max": 0.9795310497283936, "sampling/sampling_logp_difference/mean": 0.017085541039705276, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 279.390625, "completions/mean_terminated_length": 279.390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29999005794525146, "epoch": 0.7977941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.1497807854724997, "kl": 0.06688694655895233, "learning_rate": 9.194322547616997e-07, "loss": -0.0281, "num_tokens": 20474110.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996974468231201, "sampling/importance_sampling_ratio/min": 0.42440560460090637, "sampling/sampling_logp_difference/max": 0.8570656776428223, "sampling/sampling_logp_difference/mean": 0.015580703504383564, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 315.265625, "completions/mean_terminated_length": 315.265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2834051251411438, "epoch": 0.7990196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.226907817093407, "kl": 0.06724686920642853, "learning_rate": 9.190440524459202e-07, "loss": 0.0032, "num_tokens": 20515055.0, "reward": 0.09375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.775849461555481, "sampling/importance_sampling_ratio/mean": 1.0002250671386719, "sampling/importance_sampling_ratio/min": 0.3902806341648102, "sampling/sampling_logp_difference/max": 0.9408892393112183, "sampling/sampling_logp_difference/mean": 0.015994053333997726, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 294.40625, "completions/mean_terminated_length": 294.40625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.21584855020046234, "epoch": 0.8002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.0190111017409385, "kl": 0.07001065462827682, "learning_rate": 9.186549994972616e-07, "loss": 0.0081, "num_tokens": 20553129.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996395707130432, "sampling/importance_sampling_ratio/min": 0.4057103395462036, "sampling/sampling_logp_difference/max": 0.9528882503509521, "sampling/sampling_logp_difference/mean": 0.012972021475434303, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 294.984375, "completions/mean_terminated_length": 294.984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3439209461212158, "epoch": 0.8014705882352942, "frac_reward_zero_std": 0.25, "grad_norm": 1.6817432592765151, "kl": 0.10327893495559692, "learning_rate": 9.182650967054766e-07, "loss": -0.0347, "num_tokens": 20591640.0, "reward": 0.78125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000304937362671, "sampling/importance_sampling_ratio/min": 0.051093198359012604, "sampling/sampling_logp_difference/max": 2.9741039276123047, "sampling/sampling_logp_difference/mean": 0.016730263829231262, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 252.15625, "completions/mean_terminated_length": 252.15625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3634037971496582, "epoch": 0.8026960784313726, "frac_reward_zero_std": 0.0, "grad_norm": 2.508760975787, "kl": 0.09479008615016937, "learning_rate": 9.178743448620431e-07, "loss": 0.0139, "num_tokens": 20625266.0, "reward": 0.03125, "reward_std": 0.7667282819747925, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004658699035645, "sampling/importance_sampling_ratio/min": 0.5634726285934448, "sampling/sampling_logp_difference/max": 0.9256799221038818, "sampling/sampling_logp_difference/mean": 0.01804208755493164, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2615200877189636, "epoch": 0.803921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.267371213485665, "kl": 0.09800644218921661, "learning_rate": 9.174827447601627e-07, "loss": 0.0391, "num_tokens": 20661058.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6629855632781982, "sampling/importance_sampling_ratio/mean": 0.9998881816864014, "sampling/importance_sampling_ratio/min": 0.4334590435028076, "sampling/sampling_logp_difference/max": 0.8359580039978027, "sampling/sampling_logp_difference/mean": 0.011825084686279297, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 311.828125, "completions/mean_terminated_length": 311.828125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2893800139427185, "epoch": 0.8051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9510614377495171, "kl": 0.09265218675136566, "learning_rate": 9.170902971947588e-07, "loss": -0.006, "num_tokens": 20698935.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 0.9994891881942749, "sampling/importance_sampling_ratio/min": 0.26814839243888855, "sampling/sampling_logp_difference/max": 1.3162147998809814, "sampling/sampling_logp_difference/mean": 0.015570033341646194, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 195.484375, "completions/mean_terminated_length": 195.484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.21507211029529572, "epoch": 0.8063725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.0788738109658864, "kl": 0.10468155145645142, "learning_rate": 9.166970029624749e-07, "loss": 0.001, "num_tokens": 20725398.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.581348180770874, "sampling/importance_sampling_ratio/mean": 0.9999153017997742, "sampling/importance_sampling_ratio/min": 0.6252502799034119, "sampling/sampling_logp_difference/max": 0.4696033000946045, "sampling/sampling_logp_difference/mean": 0.013945435173809528, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 223.9375, "completions/mean_terminated_length": 223.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.22752121090888977, "epoch": 0.8075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.06139111869251549, "kl": 0.10037748515605927, "learning_rate": 9.163028628616738e-07, "loss": 0.001, "num_tokens": 20757906.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8718841075897217, "sampling/importance_sampling_ratio/mean": 1.0001585483551025, "sampling/importance_sampling_ratio/min": 0.5158049464225769, "sampling/sampling_logp_difference/max": 0.6620266437530518, "sampling/sampling_logp_difference/mean": 0.013948909007012844, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 296.078125, "completions/mean_terminated_length": 296.078125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3812859058380127, "epoch": 0.8088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.4307012265727914, "kl": 0.11581394076347351, "learning_rate": 9.159078776924345e-07, "loss": -0.0174, "num_tokens": 20794663.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6047122478485107, "sampling/importance_sampling_ratio/mean": 1.0004184246063232, "sampling/importance_sampling_ratio/min": 0.6100687980651855, "sampling/sampling_logp_difference/max": 0.4941835403442383, "sampling/sampling_logp_difference/mean": 0.018269555643200874, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 301.53125, "completions/mean_terminated_length": 301.53125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2671017646789551, "epoch": 0.8100490196078431, "frac_reward_zero_std": 0.25, "grad_norm": 1.6516933420548836, "kl": 0.10355545580387115, "learning_rate": 9.155120482565519e-07, "loss": 0.0121, "num_tokens": 20831353.0, "reward": 0.0625, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006881952285767, "sampling/importance_sampling_ratio/min": 0.4191407561302185, "sampling/sampling_logp_difference/max": 0.869548499584198, "sampling/sampling_logp_difference/mean": 0.014300312846899033, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 230.671875, "completions/mean_terminated_length": 230.671875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23248368501663208, "epoch": 0.8112745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.1018616070568348, "kl": 0.10352177917957306, "learning_rate": 9.15115375357535e-07, "loss": -0.0034, "num_tokens": 20860932.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5986348390579224, "sampling/importance_sampling_ratio/mean": 1.000486969947815, "sampling/importance_sampling_ratio/min": 0.6132914423942566, "sampling/sampling_logp_difference/max": 0.48891496658325195, "sampling/sampling_logp_difference/mean": 0.013316703960299492, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 248.53125, "completions/mean_terminated_length": 248.53125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2861756682395935, "epoch": 0.8125, "frac_reward_zero_std": 0.75, "grad_norm": 0.9778410901092632, "kl": 0.1253623366355896, "learning_rate": 9.147178598006044e-07, "loss": 0.0195, "num_tokens": 20892838.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.7730779647827148, "sampling/importance_sampling_ratio/mean": 1.0011225938796997, "sampling/importance_sampling_ratio/min": 0.5483730435371399, "sampling/sampling_logp_difference/max": 0.6007994413375854, "sampling/sampling_logp_difference/mean": 0.01466517522931099, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 221.359375, "completions/mean_terminated_length": 221.359375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3348690867424011, "epoch": 0.8137254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.06287644422205922, "kl": 0.10089477151632309, "learning_rate": 9.143195023926917e-07, "loss": 0.0011, "num_tokens": 20921373.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9979796409606934, "sampling/importance_sampling_ratio/mean": 1.0010348558425903, "sampling/importance_sampling_ratio/min": 0.44230785965919495, "sampling/sampling_logp_difference/max": 0.8157491683959961, "sampling/sampling_logp_difference/mean": 0.01741403341293335, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 367.046875, "completions/mean_terminated_length": 367.046875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.3210401237010956, "epoch": 0.8149509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.8162083856121677, "kl": 0.05032963678240776, "learning_rate": 9.139203039424368e-07, "loss": 0.0097, "num_tokens": 20962144.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999954104423523, "sampling/importance_sampling_ratio/min": 0.43094602227211, "sampling/sampling_logp_difference/max": 0.923588752746582, "sampling/sampling_logp_difference/mean": 0.014368601143360138, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.33943772315979004, "epoch": 0.8161764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 2.027183857117543, "kl": 0.1429498791694641, "learning_rate": 9.135202652601876e-07, "loss": 0.0098, "num_tokens": 20990488.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6068168878555298, "sampling/importance_sampling_ratio/mean": 0.9997010827064514, "sampling/importance_sampling_ratio/min": 0.4161491394042969, "sampling/sampling_logp_difference/max": 0.8767116069793701, "sampling/sampling_logp_difference/mean": 0.01838063821196556, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 325.765625, "completions/mean_terminated_length": 325.765625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3099474310874939, "epoch": 0.8174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8867191688162285, "kl": 0.07996654510498047, "learning_rate": 9.131193871579974e-07, "loss": -0.0222, "num_tokens": 21039177.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001814365386963, "sampling/importance_sampling_ratio/min": 0.5488417148590088, "sampling/sampling_logp_difference/max": 0.7263705730438232, "sampling/sampling_logp_difference/mean": 0.015426268801093102, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 267.765625, "completions/mean_terminated_length": 267.765625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.32460522651672363, "epoch": 0.8186274509803921, "frac_reward_zero_std": 0.25, "grad_norm": 1.9396556694981224, "kl": 0.13725237548351288, "learning_rate": 9.127176704496231e-07, "loss": -0.0005, "num_tokens": 21077546.0, "reward": 0.78125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6607247591018677, "sampling/importance_sampling_ratio/mean": 0.9998514652252197, "sampling/importance_sampling_ratio/min": 0.30301758646965027, "sampling/sampling_logp_difference/max": 1.1939644813537598, "sampling/sampling_logp_difference/mean": 0.018017791211605072, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 283.59375, "completions/mean_terminated_length": 283.59375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3330175280570984, "epoch": 0.8198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2150621305012839, "kl": 0.1321268230676651, "learning_rate": 9.123151159505241e-07, "loss": 0.0117, "num_tokens": 21109408.0, "reward": -0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8197921514511108, "sampling/importance_sampling_ratio/mean": 0.9998218417167664, "sampling/importance_sampling_ratio/min": 0.5043226480484009, "sampling/sampling_logp_difference/max": 0.6845390796661377, "sampling/sampling_logp_difference/mean": 0.015419330447912216, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 247.515625, "completions/mean_terminated_length": 247.515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.31832343339920044, "epoch": 0.821078431372549, "frac_reward_zero_std": 0.25, "grad_norm": 1.7962435540211532, "kl": 0.11252126097679138, "learning_rate": 9.119117244778607e-07, "loss": -0.0068, "num_tokens": 21145073.0, "reward": 0.78125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6199977397918701, "sampling/importance_sampling_ratio/mean": 1.0006837844848633, "sampling/importance_sampling_ratio/min": 0.5143709182739258, "sampling/sampling_logp_difference/max": 0.6648106575012207, "sampling/sampling_logp_difference/mean": 0.017651362344622612, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 191.21875, "completions/mean_terminated_length": 191.21875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29982662200927734, "epoch": 0.8223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.5511913501152876, "kl": 0.100832000374794, "learning_rate": 9.115074968504921e-07, "loss": 0.0226, "num_tokens": 21178943.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996993541717529, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 1.125089168548584, "sampling/sampling_logp_difference/mean": 0.016022585332393646, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 252.46875, "completions/mean_terminated_length": 252.46875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2658326029777527, "epoch": 0.8235294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.8602775957763744, "kl": 0.08079873025417328, "learning_rate": 9.111024338889746e-07, "loss": -0.0058, "num_tokens": 21209245.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6587384939193726, "sampling/importance_sampling_ratio/mean": 0.9992792010307312, "sampling/importance_sampling_ratio/min": 0.47799503803253174, "sampling/sampling_logp_difference/max": 0.7381548881530762, "sampling/sampling_logp_difference/mean": 0.014690998010337353, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 235.390625, "completions/mean_terminated_length": 235.390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24077001214027405, "epoch": 0.8247549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.07431353902079195, "kl": 0.07888946682214737, "learning_rate": 9.106965364155605e-07, "loss": 0.0007, "num_tokens": 21243622.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5283786058425903, "sampling/importance_sampling_ratio/mean": 0.9998764991760254, "sampling/importance_sampling_ratio/min": 0.5260623097419739, "sampling/sampling_logp_difference/max": 0.6423356533050537, "sampling/sampling_logp_difference/mean": 0.014851542189717293, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 247.578125, "completions/mean_terminated_length": 247.578125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.25872746109962463, "epoch": 0.8259803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.8442583116902658, "kl": 0.1108008325099945, "learning_rate": 9.102898052541957e-07, "loss": -0.0119, "num_tokens": 21280155.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6926437616348267, "sampling/importance_sampling_ratio/mean": 0.9992407560348511, "sampling/importance_sampling_ratio/min": 0.5154118537902832, "sampling/sampling_logp_difference/max": 0.662788987159729, "sampling/sampling_logp_difference/mean": 0.014943244867026806, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.21067966520786285, "epoch": 0.8272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.03647778601333582, "kl": 0.07727044075727463, "learning_rate": 9.09882241230519e-07, "loss": 0.0006, "num_tokens": 21310011.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4726886749267578, "sampling/importance_sampling_ratio/mean": 0.9998340606689453, "sampling/importance_sampling_ratio/min": 0.6132952570915222, "sampling/sampling_logp_difference/max": 0.4889087677001953, "sampling/sampling_logp_difference/mean": 0.012996380217373371, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 267.984375, "completions/mean_terminated_length": 267.984375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2664591073989868, "epoch": 0.8284313725490197, "frac_reward_zero_std": 0.5, "grad_norm": 1.2194415878196174, "kl": 0.09758429229259491, "learning_rate": 9.094738451718593e-07, "loss": -0.026, "num_tokens": 21343226.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5972039699554443, "sampling/importance_sampling_ratio/mean": 1.000079870223999, "sampling/importance_sampling_ratio/min": 0.6164050102233887, "sampling/sampling_logp_difference/max": 0.4838510751724243, "sampling/sampling_logp_difference/mean": 0.014850573614239693, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 244.796875, "completions/mean_terminated_length": 244.796875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.25565797090530396, "epoch": 0.8296568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0563848071739717, "kl": 0.07639829814434052, "learning_rate": 9.09064617907235e-07, "loss": 0.0007, "num_tokens": 21373021.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8061039447784424, "sampling/importance_sampling_ratio/mean": 1.0006325244903564, "sampling/importance_sampling_ratio/min": 0.4712215065956116, "sampling/sampling_logp_difference/max": 0.7524270415306091, "sampling/sampling_logp_difference/mean": 0.013312511146068573, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 212.203125, "completions/mean_terminated_length": 212.203125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.26213979721069336, "epoch": 0.8308823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 1.4030879522329296, "kl": 0.1045461893081665, "learning_rate": 9.086545602673513e-07, "loss": -0.0309, "num_tokens": 21401386.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6100748777389526, "sampling/importance_sampling_ratio/mean": 1.0002021789550781, "sampling/importance_sampling_ratio/min": 0.36000654101371765, "sampling/sampling_logp_difference/max": 1.0216331481933594, "sampling/sampling_logp_difference/mean": 0.015743952244520187, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 287.1875, "completions/mean_terminated_length": 287.1875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.31720712780952454, "epoch": 0.8321078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.2377377716101043, "kl": 0.09231787919998169, "learning_rate": 9.082436730845993e-07, "loss": 0.0323, "num_tokens": 21437542.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6108328104019165, "sampling/importance_sampling_ratio/mean": 1.0001319646835327, "sampling/importance_sampling_ratio/min": 0.4297125041484833, "sampling/sampling_logp_difference/max": 0.8446389436721802, "sampling/sampling_logp_difference/mean": 0.014885222539305687, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 271.65625, "completions/mean_terminated_length": 271.65625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.33510950207710266, "epoch": 0.8333333333333334, "frac_reward_zero_std": 0.5, "grad_norm": 1.5882620054880845, "kl": 0.09794466197490692, "learning_rate": 9.07831957193054e-07, "loss": 0.0252, "num_tokens": 21476640.0, "reward": -0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.842240333557129, "sampling/importance_sampling_ratio/mean": 0.9999523162841797, "sampling/importance_sampling_ratio/min": 0.4911081790924072, "sampling/sampling_logp_difference/max": 0.7110908031463623, "sampling/sampling_logp_difference/mean": 0.015907227993011475, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 191.578125, "completions/mean_terminated_length": 191.578125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3127414882183075, "epoch": 0.8345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.059262354954274656, "kl": 0.11319208145141602, "learning_rate": 9.074194134284725e-07, "loss": 0.0011, "num_tokens": 21507397.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000261068344116, "sampling/importance_sampling_ratio/min": 0.19975581765174866, "sampling/sampling_logp_difference/max": 1.6106595993041992, "sampling/sampling_logp_difference/mean": 0.0173744335770607, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 248.9375, "completions/mean_terminated_length": 248.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3500162661075592, "epoch": 0.8357843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.653802968950841, "kl": 0.10595337301492691, "learning_rate": 9.070060426282924e-07, "loss": -0.0018, "num_tokens": 21542417.0, "reward": 0.03125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.734398365020752, "sampling/importance_sampling_ratio/mean": 1.0001394748687744, "sampling/importance_sampling_ratio/min": 0.0026407502591609955, "sampling/sampling_logp_difference/max": 5.936692237854004, "sampling/sampling_logp_difference/mean": 0.017924603074789047, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 288.09375, "completions/mean_terminated_length": 288.09375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3846856355667114, "epoch": 0.8370098039215687, "frac_reward_zero_std": 0.5, "grad_norm": 1.4509856277488788, "kl": 0.07933727651834488, "learning_rate": 9.065918456316303e-07, "loss": -0.0385, "num_tokens": 21575911.0, "reward": 0.0, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9828444719314575, "sampling/importance_sampling_ratio/mean": 0.9999169707298279, "sampling/importance_sampling_ratio/min": 0.5102310180664062, "sampling/sampling_logp_difference/max": 0.6845324039459229, "sampling/sampling_logp_difference/mean": 0.01683010533452034, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 321.171875, "completions/mean_terminated_length": 321.171875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.35549187660217285, "epoch": 0.8382352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 1.105030079988878, "kl": 0.09580346941947937, "learning_rate": 9.061768232792802e-07, "loss": 0.0505, "num_tokens": 21621730.0, "reward": 0.28125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997661113739014, "sampling/importance_sampling_ratio/min": 0.620428204536438, "sampling/sampling_logp_difference/max": 0.736009955406189, "sampling/sampling_logp_difference/mean": 0.01514836959540844, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 257.46875, "completions/mean_terminated_length": 257.46875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.34016239643096924, "epoch": 0.8394607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 1.2922578121973038, "kl": 0.11721494048833847, "learning_rate": 9.057609764137109e-07, "loss": 0.0309, "num_tokens": 21658128.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.7992300987243652, "sampling/importance_sampling_ratio/mean": 1.0000033378601074, "sampling/importance_sampling_ratio/min": 0.36027073860168457, "sampling/sampling_logp_difference/max": 1.0208995342254639, "sampling/sampling_logp_difference/mean": 0.016892150044441223, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 299.421875, "completions/mean_terminated_length": 299.421875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.37253424525260925, "epoch": 0.8406862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.221488784134185, "kl": 0.11370334774255753, "learning_rate": 9.053443058790651e-07, "loss": 0.0238, "num_tokens": 21696587.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004000663757324, "sampling/importance_sampling_ratio/min": 0.5096390247344971, "sampling/sampling_logp_difference/max": 0.776209831237793, "sampling/sampling_logp_difference/mean": 0.016573626548051834, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 248.8125, "completions/mean_terminated_length": 248.8125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3833868205547333, "epoch": 0.8419117647058824, "frac_reward_zero_std": 0.25, "grad_norm": 2.2787064048635344, "kl": 0.13017961382865906, "learning_rate": 9.049268125211575e-07, "loss": -0.0214, "num_tokens": 21729791.0, "reward": 0.15625, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.617061972618103, "sampling/importance_sampling_ratio/mean": 0.9997603297233582, "sampling/importance_sampling_ratio/min": 0.5065748691558838, "sampling/sampling_logp_difference/max": 0.6800830364227295, "sampling/sampling_logp_difference/mean": 0.01717999391257763, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 179.84375, "completions/mean_terminated_length": 179.84375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.20183613896369934, "epoch": 0.8431372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0681915059421622, "kl": 0.09983283281326294, "learning_rate": 9.045084971874737e-07, "loss": 0.0009, "num_tokens": 21758629.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.5593070387840271, "sampling/sampling_logp_difference/max": 0.7898805141448975, "sampling/sampling_logp_difference/mean": 0.013118119910359383, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 255.46875, "completions/mean_terminated_length": 255.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.267900288105011, "epoch": 0.8443627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.8681915213752832, "kl": 0.09064662456512451, "learning_rate": 9.040893607271668e-07, "loss": 0.0087, "num_tokens": 21799875.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8082319498062134, "sampling/importance_sampling_ratio/mean": 1.000293254852295, "sampling/importance_sampling_ratio/min": 0.035122405737638474, "sampling/sampling_logp_difference/max": 3.3489160537719727, "sampling/sampling_logp_difference/mean": 0.014291105791926384, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 194.34375, "completions/mean_terminated_length": 194.34375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.32394784688949585, "epoch": 0.8455882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 1.4189443295984745, "kl": 0.11329706013202667, "learning_rate": 9.036694039910576e-07, "loss": 0.0161, "num_tokens": 21828297.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.7528784275054932, "sampling/importance_sampling_ratio/mean": 0.9998365640640259, "sampling/importance_sampling_ratio/min": 0.44248515367507935, "sampling/sampling_logp_difference/max": 0.8153483867645264, "sampling/sampling_logp_difference/mean": 0.01646936871111393, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 199.78125, "completions/mean_terminated_length": 199.78125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3520262539386749, "epoch": 0.8468137254901961, "frac_reward_zero_std": 0.25, "grad_norm": 2.1899806532441306, "kl": 0.1304454207420349, "learning_rate": 9.032486278316313e-07, "loss": -0.0161, "num_tokens": 21858795.0, "reward": -0.1875, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002292394638062, "sampling/importance_sampling_ratio/min": 0.4083217978477478, "sampling/sampling_logp_difference/max": 0.895699679851532, "sampling/sampling_logp_difference/mean": 0.01799740642309189, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 211.203125, "completions/mean_terminated_length": 211.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.35917019844055176, "epoch": 0.8480392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.9798873160668752, "kl": 0.11700897663831711, "learning_rate": 9.028270331030372e-07, "loss": -0.0038, "num_tokens": 21890408.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6099342107772827, "sampling/importance_sampling_ratio/mean": 1.0000842809677124, "sampling/importance_sampling_ratio/min": 0.5990613698959351, "sampling/sampling_logp_difference/max": 0.512391209602356, "sampling/sampling_logp_difference/mean": 0.01729101501405239, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 163.296875, "completions/mean_terminated_length": 163.296875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3510112762451172, "epoch": 0.8492647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 2.2123932223640246, "kl": 0.14133670926094055, "learning_rate": 9.024046206610857e-07, "loss": -0.0076, "num_tokens": 21919771.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6247493028640747, "sampling/importance_sampling_ratio/mean": 1.0004782676696777, "sampling/importance_sampling_ratio/min": 0.6176419258117676, "sampling/sampling_logp_difference/max": 0.4853534698486328, "sampling/sampling_logp_difference/mean": 0.017106808722019196, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 205.328125, "completions/mean_terminated_length": 205.328125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.29586395621299744, "epoch": 0.8504901960784313, "frac_reward_zero_std": 0.75, "grad_norm": 1.1910071691315411, "kl": 0.11373498290777206, "learning_rate": 9.019813913632475e-07, "loss": 0.0064, "num_tokens": 21950800.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9536798000335693, "sampling/importance_sampling_ratio/mean": 0.9998559951782227, "sampling/importance_sampling_ratio/min": 0.42452549934387207, "sampling/sampling_logp_difference/max": 0.8567832708358765, "sampling/sampling_logp_difference/mean": 0.01530240848660469, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 139.984375, "completions/mean_terminated_length": 139.984375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.29637232422828674, "epoch": 0.8517156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0666575119247821, "kl": 0.11328499764204025, "learning_rate": 9.015573460686509e-07, "loss": 0.0012, "num_tokens": 21976303.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.649452567100525, "sampling/importance_sampling_ratio/mean": 1.0007152557373047, "sampling/importance_sampling_ratio/min": 0.3134042024612427, "sampling/sampling_logp_difference/max": 1.160261631011963, "sampling/sampling_logp_difference/mean": 0.018013473600149155, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 155.609375, "completions/mean_terminated_length": 155.609375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.34009021520614624, "epoch": 0.8529411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 2.2575099023093053, "kl": 0.19062024354934692, "learning_rate": 9.011324856380813e-07, "loss": 0.0463, "num_tokens": 22003206.0, "reward": 0.125, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001416206359863, "sampling/importance_sampling_ratio/min": 0.4293578863143921, "sampling/sampling_logp_difference/max": 0.9967107772827148, "sampling/sampling_logp_difference/mean": 0.018071670085191727, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 218.46875, "completions/mean_terminated_length": 218.46875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.41182026267051697, "epoch": 0.8541666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 1.5031797223895609, "kl": 0.13558265566825867, "learning_rate": 9.007068109339783e-07, "loss": -0.0399, "num_tokens": 22036180.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000173807144165, "sampling/importance_sampling_ratio/min": 0.5055890083312988, "sampling/sampling_logp_difference/max": 1.032625675201416, "sampling/sampling_logp_difference/mean": 0.01840224117040634, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 181.015625, "completions/mean_terminated_length": 181.015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.28813767433166504, "epoch": 0.8553921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.3615341271921366, "kl": 0.09089133143424988, "learning_rate": 9.002803228204348e-07, "loss": 0.0068, "num_tokens": 22067205.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999821186065674, "sampling/importance_sampling_ratio/min": 0.4525822401046753, "sampling/sampling_logp_difference/max": 0.987877607345581, "sampling/sampling_logp_difference/mean": 0.015335430391132832, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 196.90625, "completions/mean_terminated_length": 196.90625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3749622106552124, "epoch": 0.8566176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 1.1167853118622275, "kl": 0.14343884587287903, "learning_rate": 8.998530221631941e-07, "loss": 0.0398, "num_tokens": 22099855.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.605394721031189, "sampling/importance_sampling_ratio/mean": 0.9992402791976929, "sampling/importance_sampling_ratio/min": 0.40643367171287537, "sampling/sampling_logp_difference/max": 0.9003345966339111, "sampling/sampling_logp_difference/mean": 0.017833959311246872, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 204.109375, "completions/mean_terminated_length": 204.109375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.32660678029060364, "epoch": 0.8578431372549019, "frac_reward_zero_std": 0.5, "grad_norm": 2.2200355245520216, "kl": 0.1060405820608139, "learning_rate": 8.994249098296502e-07, "loss": 0.0273, "num_tokens": 22130614.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5495752096176147, "sampling/importance_sampling_ratio/mean": 0.9998863935470581, "sampling/importance_sampling_ratio/min": 0.6147378087043762, "sampling/sampling_logp_difference/max": 0.4865594506263733, "sampling/sampling_logp_difference/mean": 0.015310069546103477, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 156.265625, "completions/mean_terminated_length": 156.265625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3106486201286316, "epoch": 0.8590686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.22984067557623256, "kl": 0.12091071158647537, "learning_rate": 8.989959866888437e-07, "loss": 0.0012, "num_tokens": 22160279.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6522029638290405, "sampling/importance_sampling_ratio/mean": 0.9998528361320496, "sampling/importance_sampling_ratio/min": 0.09124992042779922, "sampling/sampling_logp_difference/max": 2.394153118133545, "sampling/sampling_logp_difference/mean": 0.016085583716630936, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 157.46875, "completions/mean_terminated_length": 157.46875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.24609866738319397, "epoch": 0.8602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.09035589869489256, "kl": 0.12495503574609756, "learning_rate": 8.985662536114612e-07, "loss": 0.0012, "num_tokens": 22189877.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9663197994232178, "sampling/importance_sampling_ratio/mean": 0.9997924566268921, "sampling/importance_sampling_ratio/min": 0.3909452557563782, "sampling/sampling_logp_difference/max": 0.9391877055168152, "sampling/sampling_logp_difference/mean": 0.015291010960936546, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.23346850275993347, "epoch": 0.8615196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0788715941297072, "kl": 0.09071210026741028, "learning_rate": 8.981357114698338e-07, "loss": 0.0009, "num_tokens": 22223541.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7657884359359741, "sampling/importance_sampling_ratio/mean": 1.0000474452972412, "sampling/importance_sampling_ratio/min": 0.5173292756080627, "sampling/sampling_logp_difference/max": 0.6590757369995117, "sampling/sampling_logp_difference/mean": 0.012250609695911407, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 175.265625, "completions/mean_terminated_length": 175.265625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3375943899154663, "epoch": 0.8627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.406638942731287, "kl": 0.1338985413312912, "learning_rate": 8.977043611379349e-07, "loss": -0.0306, "num_tokens": 22251942.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9877376556396484, "sampling/importance_sampling_ratio/mean": 0.9998382925987244, "sampling/importance_sampling_ratio/min": 0.5764951109886169, "sampling/sampling_logp_difference/max": 0.6869971752166748, "sampling/sampling_logp_difference/mean": 0.017610445618629456, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 198.65625, "completions/mean_terminated_length": 198.65625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24917840957641602, "epoch": 0.8639705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.07326889467215009, "kl": 0.08637434244155884, "learning_rate": 8.972722034913781e-07, "loss": 0.0009, "num_tokens": 22288288.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5786354541778564, "sampling/importance_sampling_ratio/mean": 1.000070571899414, "sampling/importance_sampling_ratio/min": 0.5260618925094604, "sampling/sampling_logp_difference/max": 0.642336368560791, "sampling/sampling_logp_difference/mean": 0.01618223637342453, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3293379843235016, "epoch": 0.8651960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.923492823775697, "kl": 0.19896593689918518, "learning_rate": 8.968392394074163e-07, "loss": 0.0024, "num_tokens": 22314868.0, "reward": -0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995007514953613, "sampling/importance_sampling_ratio/min": 0.5738527774810791, "sampling/sampling_logp_difference/max": 0.7254873514175415, "sampling/sampling_logp_difference/mean": 0.01853271946310997, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 169.65625, "completions/mean_terminated_length": 169.65625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.29563915729522705, "epoch": 0.866421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.316755352173363, "kl": 0.1290314793586731, "learning_rate": 8.964054697649388e-07, "loss": 0.038, "num_tokens": 22344814.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.681364893913269, "sampling/importance_sampling_ratio/mean": 0.9994768500328064, "sampling/importance_sampling_ratio/min": 0.2286689728498459, "sampling/sampling_logp_difference/max": 1.4754798412322998, "sampling/sampling_logp_difference/mean": 0.01796390675008297, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 173.484375, "completions/mean_terminated_length": 173.484375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2853643000125885, "epoch": 0.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.07137457226982497, "kl": 0.11410938203334808, "learning_rate": 8.959708954444708e-07, "loss": 0.0011, "num_tokens": 22369357.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996079802513123, "sampling/importance_sampling_ratio/min": 0.5437241792678833, "sampling/sampling_logp_difference/max": 1.5145487785339355, "sampling/sampling_logp_difference/mean": 0.017212729901075363, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 149.46875, "completions/mean_terminated_length": 149.46875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3079431653022766, "epoch": 0.8688725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.4540088112938152, "kl": 0.1305399388074875, "learning_rate": 8.955355173281707e-07, "loss": 0.0104, "num_tokens": 22394171.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6495933532714844, "sampling/importance_sampling_ratio/mean": 0.9999434351921082, "sampling/importance_sampling_ratio/min": 0.5773681402206421, "sampling/sampling_logp_difference/max": 0.549275279045105, "sampling/sampling_logp_difference/mean": 0.016211165115237236, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 177.015625, "completions/mean_terminated_length": 177.015625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2947250008583069, "epoch": 0.8700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.3316971512390994, "kl": 0.11985829472541809, "learning_rate": 8.95099336299828e-07, "loss": 0.0029, "num_tokens": 22424316.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6344491243362427, "sampling/importance_sampling_ratio/mean": 0.999924898147583, "sampling/importance_sampling_ratio/min": 0.5699969530105591, "sampling/sampling_logp_difference/max": 0.5621242523193359, "sampling/sampling_logp_difference/mean": 0.017383884638547897, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 180.46875, "completions/mean_terminated_length": 180.46875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.27492639422416687, "epoch": 0.8713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.05769349472093441, "kl": 0.12104004621505737, "learning_rate": 8.946623532448631e-07, "loss": 0.0011, "num_tokens": 22454314.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5753906965255737, "sampling/importance_sampling_ratio/mean": 0.9996606111526489, "sampling/importance_sampling_ratio/min": 0.5292960405349731, "sampling/sampling_logp_difference/max": 0.6362073421478271, "sampling/sampling_logp_difference/mean": 0.014977009035646915, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 152.40625, "completions/mean_terminated_length": 152.40625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.2546907663345337, "epoch": 0.8725490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.5431543846984839, "kl": 0.1233743354678154, "learning_rate": 8.942245690503238e-07, "loss": 0.012, "num_tokens": 22479860.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5062928199768066, "sampling/importance_sampling_ratio/mean": 0.9999198913574219, "sampling/importance_sampling_ratio/min": 0.6306304335594177, "sampling/sampling_logp_difference/max": 0.46103525161743164, "sampling/sampling_logp_difference/mean": 0.012808259576559067, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 179.40625, "completions/mean_terminated_length": 179.40625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.21484437584877014, "epoch": 0.8737745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0898677002345635, "kl": 0.10381156206130981, "learning_rate": 8.937859846048842e-07, "loss": 0.001, "num_tokens": 22507998.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5619345903396606, "sampling/importance_sampling_ratio/mean": 0.9998908042907715, "sampling/importance_sampling_ratio/min": 0.607501208782196, "sampling/sampling_logp_difference/max": 0.4984011650085449, "sampling/sampling_logp_difference/mean": 0.012432791292667389, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 224.734375, "completions/mean_terminated_length": 224.734375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3591345250606537, "epoch": 0.875, "frac_reward_zero_std": 0.5, "grad_norm": 1.438804498440999, "kl": 0.12299526482820511, "learning_rate": 8.933466007988429e-07, "loss": 0.0304, "num_tokens": 22539133.0, "reward": -0.1875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5407992601394653, "sampling/importance_sampling_ratio/mean": 0.999602735042572, "sampling/importance_sampling_ratio/min": 0.5887510776519775, "sampling/sampling_logp_difference/max": 0.5297517776489258, "sampling/sampling_logp_difference/mean": 0.016061928123235703, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 131.34375, "completions/mean_terminated_length": 131.34375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.21670114994049072, "epoch": 0.8762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0910943446591457, "kl": 0.10475843399763107, "learning_rate": 8.929064185241212e-07, "loss": 0.0011, "num_tokens": 22559667.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.798926591873169, "sampling/importance_sampling_ratio/mean": 1.0005546808242798, "sampling/importance_sampling_ratio/min": 0.4834917485713959, "sampling/sampling_logp_difference/max": 0.7267210483551025, "sampling/sampling_logp_difference/mean": 0.01417156495153904, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 205.71875, "completions/mean_terminated_length": 205.71875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.24957765638828278, "epoch": 0.8774509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.1269812297058832, "kl": 0.08890828490257263, "learning_rate": 8.924654386742611e-07, "loss": 0.0595, "num_tokens": 22589553.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5645136833190918, "sampling/importance_sampling_ratio/mean": 1.0001609325408936, "sampling/importance_sampling_ratio/min": 0.4951017498970032, "sampling/sampling_logp_difference/max": 0.7029919624328613, "sampling/sampling_logp_difference/mean": 0.01214967854321003, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 158.46875, "completions/mean_terminated_length": 158.46875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.26717567443847656, "epoch": 0.8786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.06135570342970846, "kl": 0.11892708390951157, "learning_rate": 8.920236621444242e-07, "loss": 0.0012, "num_tokens": 22617359.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6179018020629883, "sampling/importance_sampling_ratio/mean": 1.000462532043457, "sampling/importance_sampling_ratio/min": 0.6216425895690918, "sampling/sampling_logp_difference/max": 0.48113012313842773, "sampling/sampling_logp_difference/mean": 0.01484605297446251, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 133.203125, "completions/mean_terminated_length": 133.203125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.26337453722953796, "epoch": 0.8799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.3501941892853542, "kl": 0.1662946194410324, "learning_rate": 8.915810898313884e-07, "loss": -0.0012, "num_tokens": 22645692.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.8564430475234985, "sampling/importance_sampling_ratio/mean": 1.0003392696380615, "sampling/importance_sampling_ratio/min": 0.5689576268196106, "sampling/sampling_logp_difference/max": 0.6186623573303223, "sampling/sampling_logp_difference/mean": 0.017064658924937248, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 163.921875, "completions/mean_terminated_length": 163.921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.28888508677482605, "epoch": 0.8811274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.392013915287687, "kl": 0.1110411211848259, "learning_rate": 8.911377226335478e-07, "loss": -0.0236, "num_tokens": 22678407.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000640153884888, "sampling/importance_sampling_ratio/min": 0.3980575203895569, "sampling/sampling_logp_difference/max": 0.9211587905883789, "sampling/sampling_logp_difference/mean": 0.016323750838637352, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 191.9375, "completions/mean_terminated_length": 191.9375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2733198404312134, "epoch": 0.8823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.826168728400338, "kl": 0.13752731680870056, "learning_rate": 8.906935614509095e-07, "loss": 0.0313, "num_tokens": 22706403.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5975159406661987, "sampling/importance_sampling_ratio/mean": 1.0002658367156982, "sampling/importance_sampling_ratio/min": 0.6048009991645813, "sampling/sampling_logp_difference/max": 0.5028557777404785, "sampling/sampling_logp_difference/mean": 0.01455213874578476, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 164.828125, "completions/mean_terminated_length": 164.828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4618142247200012, "epoch": 0.883578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.8854194347941413, "kl": 0.21749398112297058, "learning_rate": 8.902486071850926e-07, "loss": 0.0349, "num_tokens": 22739864.0, "reward": -0.09375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8903597593307495, "sampling/importance_sampling_ratio/mean": 0.9992150664329529, "sampling/importance_sampling_ratio/min": 0.526161253452301, "sampling/sampling_logp_difference/max": 0.6421475410461426, "sampling/sampling_logp_difference/mean": 0.021756840869784355, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 138.4375, "completions/mean_terminated_length": 138.4375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3331761360168457, "epoch": 0.8848039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.610386235911436, "kl": 0.15505921840667725, "learning_rate": 8.89802860739326e-07, "loss": 0.0143, "num_tokens": 22769428.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009257793426514, "sampling/importance_sampling_ratio/min": 0.5311965346336365, "sampling/sampling_logp_difference/max": 0.7143645286560059, "sampling/sampling_logp_difference/mean": 0.018353819847106934, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 171.828125, "completions/mean_terminated_length": 171.828125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.35276901721954346, "epoch": 0.8860294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.2634663462798594, "kl": 0.12248918414115906, "learning_rate": 8.89356323018447e-07, "loss": -0.0115, "num_tokens": 22799625.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6889691352844238, "sampling/importance_sampling_ratio/mean": 1.000432014465332, "sampling/importance_sampling_ratio/min": 0.46902188658714294, "sampling/sampling_logp_difference/max": 0.757105827331543, "sampling/sampling_logp_difference/mean": 0.01768193021416664, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 124.453125, "completions/mean_terminated_length": 124.453125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3187045753002167, "epoch": 0.8872549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 2.6120365143953608, "kl": 0.2303055375814438, "learning_rate": 8.889089949288986e-07, "loss": -0.019, "num_tokens": 22821702.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.730854868888855, "sampling/importance_sampling_ratio/mean": 1.000281810760498, "sampling/importance_sampling_ratio/min": 0.6541826725006104, "sampling/sampling_logp_difference/max": 0.5486154556274414, "sampling/sampling_logp_difference/mean": 0.018116045743227005, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 132.796875, "completions/mean_terminated_length": 132.796875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.18036410212516785, "epoch": 0.8884803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.1824288888284623, "kl": 0.10626198351383209, "learning_rate": 8.884608773787288e-07, "loss": 0.001, "num_tokens": 22843641.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4594168663024902, "sampling/importance_sampling_ratio/mean": 1.0004578828811646, "sampling/importance_sampling_ratio/min": 0.49541574716567993, "sampling/sampling_logp_difference/max": 0.7023580074310303, "sampling/sampling_logp_difference/mean": 0.012088064104318619, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 172.65625, "completions/mean_terminated_length": 172.65625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.31260034441947937, "epoch": 0.8897058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.8754439548088546, "kl": 0.1252618134021759, "learning_rate": 8.880119712775875e-07, "loss": 0.0219, "num_tokens": 22873123.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.595409870147705, "sampling/importance_sampling_ratio/mean": 1.0009194612503052, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 0.6855719089508057, "sampling/sampling_logp_difference/mean": 0.015545779839158058, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 176.46875, "completions/mean_terminated_length": 176.46875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.35207128524780273, "epoch": 0.8909313725490197, "frac_reward_zero_std": 0.25, "grad_norm": 2.2894315350001015, "kl": 0.14700110256671906, "learning_rate": 8.875622775367259e-07, "loss": 0.0803, "num_tokens": 22900337.0, "reward": 0.84375, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6219651699066162, "sampling/importance_sampling_ratio/mean": 1.0002046823501587, "sampling/importance_sampling_ratio/min": 0.3306296169757843, "sampling/sampling_logp_difference/max": 1.106756567955017, "sampling/sampling_logp_difference/mean": 0.017628487199544907, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.442595899105072, "epoch": 0.8921568627450981, "frac_reward_zero_std": 0.25, "grad_norm": 1.9493746234582252, "kl": 0.12233205139636993, "learning_rate": 8.871117970689937e-07, "loss": 0.0071, "num_tokens": 22934025.0, "reward": 0.03125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995599985122681, "sampling/importance_sampling_ratio/min": 0.22002194821834564, "sampling/sampling_logp_difference/max": 1.5140279531478882, "sampling/sampling_logp_difference/mean": 0.020226042717695236, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 167.8125, "completions/mean_terminated_length": 167.8125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.273113489151001, "epoch": 0.8933823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 1.431421698045663, "kl": 0.12466256320476532, "learning_rate": 8.866605307888376e-07, "loss": 0.032, "num_tokens": 22961757.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6836470365524292, "sampling/importance_sampling_ratio/mean": 1.0004937648773193, "sampling/importance_sampling_ratio/min": 0.49846717715263367, "sampling/sampling_logp_difference/max": 0.6962175369262695, "sampling/sampling_logp_difference/mean": 0.014166696928441525, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 171.328125, "completions/mean_terminated_length": 171.328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3084203004837036, "epoch": 0.8946078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 2.1807505881028106, "kl": 0.1305488795042038, "learning_rate": 8.862084796122997e-07, "loss": 0.0412, "num_tokens": 22991938.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.775182843208313, "sampling/importance_sampling_ratio/mean": 1.000341773033142, "sampling/importance_sampling_ratio/min": 0.512789785861969, "sampling/sampling_logp_difference/max": 0.6678893566131592, "sampling/sampling_logp_difference/mean": 0.017210688441991806, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 133.984375, "completions/mean_terminated_length": 133.984375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3547746539115906, "epoch": 0.8958333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 1.8947466005816809, "kl": 0.17236892879009247, "learning_rate": 8.857556444570153e-07, "loss": -0.0089, "num_tokens": 23018289.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993709921836853, "sampling/importance_sampling_ratio/min": 0.5687522292137146, "sampling/sampling_logp_difference/max": 0.7245337963104248, "sampling/sampling_logp_difference/mean": 0.016643021255731583, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 160.734375, "completions/mean_terminated_length": 160.734375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3138841986656189, "epoch": 0.8970588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 1.344781421912304, "kl": 0.11245203018188477, "learning_rate": 8.853020262422109e-07, "loss": -0.0154, "num_tokens": 23042752.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7926790714263916, "sampling/importance_sampling_ratio/mean": 0.9999420642852783, "sampling/importance_sampling_ratio/min": 0.6133373975753784, "sampling/sampling_logp_difference/max": 0.5837111473083496, "sampling/sampling_logp_difference/mean": 0.015560884959995747, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 190.921875, "completions/mean_terminated_length": 190.921875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2579788565635681, "epoch": 0.8982843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.150261385713446, "kl": 0.10539811849594116, "learning_rate": 8.84847625888703e-07, "loss": 0.0217, "num_tokens": 23076363.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.621700406074524, "sampling/importance_sampling_ratio/mean": 0.999610424041748, "sampling/importance_sampling_ratio/min": 0.6068662405014038, "sampling/sampling_logp_difference/max": 0.4994468688964844, "sampling/sampling_logp_difference/mean": 0.012584343552589417, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3112919330596924, "epoch": 0.8995098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 1.6945912421821459, "kl": 0.16028791666030884, "learning_rate": 8.843924443188953e-07, "loss": -0.0124, "num_tokens": 23104299.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8255785703659058, "sampling/importance_sampling_ratio/mean": 0.9997628927230835, "sampling/importance_sampling_ratio/min": 0.39045611023902893, "sampling/sampling_logp_difference/max": 0.9404397010803223, "sampling/sampling_logp_difference/mean": 0.017352301627397537, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 166.328125, "completions/mean_terminated_length": 166.328125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3333016633987427, "epoch": 0.9007352941176471, "frac_reward_zero_std": 0.5, "grad_norm": 2.136442992868966, "kl": 0.19512274861335754, "learning_rate": 8.839364824567775e-07, "loss": -0.0368, "num_tokens": 23131616.0, "reward": 0.28125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.7447431087493896, "sampling/importance_sampling_ratio/mean": 0.9993703365325928, "sampling/importance_sampling_ratio/min": 0.5949843525886536, "sampling/sampling_logp_difference/max": 0.5566072463989258, "sampling/sampling_logp_difference/mean": 0.01808866485953331, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 239.25, "completions/mean_terminated_length": 239.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.39610356092453003, "epoch": 0.9019607843137255, "frac_reward_zero_std": 0.25, "grad_norm": 1.933601120837525, "kl": 0.13343805074691772, "learning_rate": 8.834797412279235e-07, "loss": -0.0563, "num_tokens": 23167712.0, "reward": 0.4375, "reward_std": 0.5738953948020935, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9925565719604492, "sampling/importance_sampling_ratio/mean": 1.0004668235778809, "sampling/importance_sampling_ratio/min": 0.4509410262107849, "sampling/sampling_logp_difference/max": 0.7964186668395996, "sampling/sampling_logp_difference/mean": 0.01696062460541725, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 131.859375, "completions/mean_terminated_length": 131.859375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3019437789916992, "epoch": 0.9031862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.5423557238277512, "kl": 0.19669732451438904, "learning_rate": 8.83022221559489e-07, "loss": -0.0215, "num_tokens": 23190471.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4498867988586426, "sampling/importance_sampling_ratio/mean": 1.0004997253417969, "sampling/importance_sampling_ratio/min": 0.6259177327156067, "sampling/sampling_logp_difference/max": 0.468536376953125, "sampling/sampling_logp_difference/mean": 0.015907293185591698, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 145.484375, "completions/mean_terminated_length": 145.484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.30380311608314514, "epoch": 0.9044117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 1.355611611588332, "kl": 0.14590933918952942, "learning_rate": 8.825639243802098e-07, "loss": 0.0064, "num_tokens": 23222038.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.964614987373352, "sampling/importance_sampling_ratio/mean": 1.0001753568649292, "sampling/importance_sampling_ratio/min": 0.6057144403457642, "sampling/sampling_logp_difference/max": 0.6752963066101074, "sampling/sampling_logp_difference/mean": 0.01644541323184967, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2797512412071228, "epoch": 0.9056372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.4076118802180047, "kl": 0.10469253361225128, "learning_rate": 8.821048506204005e-07, "loss": -0.063, "num_tokens": 23247302.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5281046628952026, "sampling/importance_sampling_ratio/mean": 0.9992679357528687, "sampling/importance_sampling_ratio/min": 0.48816025257110596, "sampling/sampling_logp_difference/max": 0.7171115875244141, "sampling/sampling_logp_difference/mean": 0.013918949291110039, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 202.890625, "completions/mean_terminated_length": 202.890625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.38396427035331726, "epoch": 0.9068627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 1.2545918080625817, "kl": 0.142167866230011, "learning_rate": 8.816450012119513e-07, "loss": 0.0014, "num_tokens": 23283855.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002930164337158, "sampling/importance_sampling_ratio/min": 0.6144245266914368, "sampling/sampling_logp_difference/max": 0.7409529685974121, "sampling/sampling_logp_difference/mean": 0.01684897020459175, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 168.515625, "completions/mean_terminated_length": 168.515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.288840115070343, "epoch": 0.9080882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 0.9517173861653833, "kl": 0.11101772636175156, "learning_rate": 8.811843770883276e-07, "loss": -0.0023, "num_tokens": 23312512.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999265670776367, "sampling/importance_sampling_ratio/min": 0.4923335611820221, "sampling/sampling_logp_difference/max": 0.7541918754577637, "sampling/sampling_logp_difference/mean": 0.014561197720468044, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 250.140625, "completions/mean_terminated_length": 250.140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.42110323905944824, "epoch": 0.9093137254901961, "frac_reward_zero_std": 0.25, "grad_norm": 1.944178334035574, "kl": 0.11443128436803818, "learning_rate": 8.807229791845671e-07, "loss": -0.0144, "num_tokens": 23348249.0, "reward": -0.03125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.815877079963684, "sampling/importance_sampling_ratio/mean": 1.000603437423706, "sampling/importance_sampling_ratio/min": 0.5373468995094299, "sampling/sampling_logp_difference/max": 0.6211113929748535, "sampling/sampling_logp_difference/mean": 0.017753958702087402, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 197.09375, "completions/mean_terminated_length": 197.09375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2682134509086609, "epoch": 0.9105392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.3822187220239395, "kl": 0.1007651761174202, "learning_rate": 8.802608084372785e-07, "loss": 0.0442, "num_tokens": 23382143.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6189647912979126, "sampling/importance_sampling_ratio/mean": 0.999918520450592, "sampling/importance_sampling_ratio/min": 0.5457268357276917, "sampling/sampling_logp_difference/max": 0.6056368350982666, "sampling/sampling_logp_difference/mean": 0.013500851579010487, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 224.984375, "completions/mean_terminated_length": 224.984375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3589177429676056, "epoch": 0.9117647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.9843702764558497, "kl": 0.11899229139089584, "learning_rate": 8.79797865784639e-07, "loss": -0.0247, "num_tokens": 23413582.0, "reward": 0.03125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.762001872062683, "sampling/importance_sampling_ratio/mean": 1.0001921653747559, "sampling/importance_sampling_ratio/min": 0.6599255204200745, "sampling/sampling_logp_difference/max": 0.5664505958557129, "sampling/sampling_logp_difference/mean": 0.015161126852035522, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 214.921875, "completions/mean_terminated_length": 214.921875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3481399714946747, "epoch": 0.9129901960784313, "frac_reward_zero_std": 0.75, "grad_norm": 1.37079857009415, "kl": 0.12129916250705719, "learning_rate": 8.793341521663928e-07, "loss": 0.0648, "num_tokens": 23446041.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5652804374694824, "sampling/importance_sampling_ratio/mean": 1.000514030456543, "sampling/importance_sampling_ratio/min": 0.5542005300521851, "sampling/sampling_logp_difference/max": 0.5902286767959595, "sampling/sampling_logp_difference/mean": 0.014764299616217613, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 179.90625, "completions/mean_terminated_length": 179.90625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.26453089714050293, "epoch": 0.9142156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.08713475356444839, "kl": 0.1045544445514679, "learning_rate": 8.788696685238494e-07, "loss": 0.001, "num_tokens": 23475891.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.577684998512268, "sampling/importance_sampling_ratio/mean": 0.9993813037872314, "sampling/importance_sampling_ratio/min": 0.5581960082054138, "sampling/sampling_logp_difference/max": 0.5830450057983398, "sampling/sampling_logp_difference/mean": 0.014570602215826511, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 188.265625, "completions/mean_terminated_length": 188.265625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.24562190473079681, "epoch": 0.9154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.05687146209703193, "kl": 0.10551153123378754, "learning_rate": 8.784044157998809e-07, "loss": 0.001, "num_tokens": 23502484.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4801042079925537, "sampling/importance_sampling_ratio/mean": 0.9999704360961914, "sampling/importance_sampling_ratio/min": 0.5629025101661682, "sampling/sampling_logp_difference/max": 0.5746488571166992, "sampling/sampling_logp_difference/mean": 0.012423046864569187, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 236.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4058482050895691, "epoch": 0.9166666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 1.652487366830316, "kl": 0.14283131062984467, "learning_rate": 8.779383949389208e-07, "loss": -0.0465, "num_tokens": 23536764.0, "reward": 0.09375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.9442535638809204, "sampling/importance_sampling_ratio/mean": 1.0002634525299072, "sampling/importance_sampling_ratio/min": 0.03916797414422035, "sampling/sampling_logp_difference/max": 3.239895820617676, "sampling/sampling_logp_difference/mean": 0.018831366673111916, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 146.3125, "completions/mean_terminated_length": 146.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23472276329994202, "epoch": 0.9178921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.601902455625568, "kl": 0.10527126491069794, "learning_rate": 8.774716068869623e-07, "loss": -0.013, "num_tokens": 23562448.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4908369779586792, "sampling/importance_sampling_ratio/mean": 0.9994587302207947, "sampling/importance_sampling_ratio/min": 0.5684351921081543, "sampling/sampling_logp_difference/max": 0.5648679733276367, "sampling/sampling_logp_difference/mean": 0.013005070388317108, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.35371267795562744, "epoch": 0.9191176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 0.912311800476807, "kl": 0.06903313100337982, "learning_rate": 8.770040525915553e-07, "loss": 0.0118, "num_tokens": 23610000.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8112993240356445, "sampling/importance_sampling_ratio/mean": 1.0004202127456665, "sampling/importance_sampling_ratio/min": 0.5983051061630249, "sampling/sampling_logp_difference/max": 0.5940444469451904, "sampling/sampling_logp_difference/mean": 0.015579350292682648, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 188.421875, "completions/mean_terminated_length": 188.421875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.29587602615356445, "epoch": 0.9203431372549019, "frac_reward_zero_std": 0.75, "grad_norm": 1.1386399902225393, "kl": 0.1196604073047638, "learning_rate": 8.765357330018055e-07, "loss": -0.0425, "num_tokens": 23638843.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5930376052856445, "sampling/importance_sampling_ratio/mean": 1.0000560283660889, "sampling/importance_sampling_ratio/min": 0.552837610244751, "sampling/sampling_logp_difference/max": 0.5926909446716309, "sampling/sampling_logp_difference/mean": 0.014708933420479298, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 245.5625, "completions/mean_terminated_length": 245.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2548101842403412, "epoch": 0.9215686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.8398398133402648, "kl": 0.13636735081672668, "learning_rate": 8.760666490683719e-07, "loss": -0.0048, "num_tokens": 23671215.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6207329034805298, "sampling/importance_sampling_ratio/mean": 0.9996460676193237, "sampling/importance_sampling_ratio/min": 0.4399203658103943, "sampling/sampling_logp_difference/max": 0.8211615085601807, "sampling/sampling_logp_difference/mean": 0.013017473742365837, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3652108907699585, "epoch": 0.9227941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.1424372731621233, "kl": 0.1297123283147812, "learning_rate": 8.755968017434651e-07, "loss": -0.0019, "num_tokens": 23700487.0, "reward": -0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5887606143951416, "sampling/importance_sampling_ratio/mean": 1.0001184940338135, "sampling/importance_sampling_ratio/min": 0.6151662468910217, "sampling/sampling_logp_difference/max": 0.48586273193359375, "sampling/sampling_logp_difference/mean": 0.018013084307312965, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.27533072233200073, "epoch": 0.9240196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.047112638402082026, "kl": 0.08654429018497467, "learning_rate": 8.751261919808457e-07, "loss": 0.0009, "num_tokens": 23734519.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7003505229949951, "sampling/importance_sampling_ratio/mean": 1.0001380443572998, "sampling/importance_sampling_ratio/min": 0.39048299193382263, "sampling/sampling_logp_difference/max": 0.9403707981109619, "sampling/sampling_logp_difference/mean": 0.015848493203520775, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 266.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20762838423252106, "epoch": 0.9252450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8807952593732118, "kl": 0.08133986592292786, "learning_rate": 8.746548207358215e-07, "loss": 0.0097, "num_tokens": 23774951.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000663995742798, "sampling/importance_sampling_ratio/min": 0.5408735275268555, "sampling/sampling_logp_difference/max": 1.1860003471374512, "sampling/sampling_logp_difference/mean": 0.011015929281711578, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 258.359375, "completions/mean_terminated_length": 258.359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.40661847591400146, "epoch": 0.9264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.3951694354347108, "kl": 0.13240307569503784, "learning_rate": 8.741826889652463e-07, "loss": -0.0076, "num_tokens": 23814590.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4788700342178345, "sampling/importance_sampling_ratio/mean": 0.9997360110282898, "sampling/importance_sampling_ratio/min": 0.3830121159553528, "sampling/sampling_logp_difference/max": 0.9596887230873108, "sampling/sampling_logp_difference/mean": 0.0171474888920784, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 258.71875, "completions/mean_terminated_length": 258.71875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2582932710647583, "epoch": 0.9276960784313726, "frac_reward_zero_std": 0.25, "grad_norm": 1.5198818757485997, "kl": 0.11752104014158249, "learning_rate": 8.737097976275176e-07, "loss": 0.0292, "num_tokens": 23846860.0, "reward": 0.84375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6279054880142212, "sampling/importance_sampling_ratio/mean": 1.0000791549682617, "sampling/importance_sampling_ratio/min": 0.6158292293548584, "sampling/sampling_logp_difference/max": 0.48729419708251953, "sampling/sampling_logp_difference/mean": 0.012493856251239777, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 264.46875, "completions/mean_terminated_length": 264.46875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.34261465072631836, "epoch": 0.928921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.1119301867776166, "kl": 0.092156782746315, "learning_rate": 8.73236147682575e-07, "loss": 0.0132, "num_tokens": 23889082.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004080533981323, "sampling/importance_sampling_ratio/min": 0.43092551827430725, "sampling/sampling_logp_difference/max": 0.9247453212738037, "sampling/sampling_logp_difference/mean": 0.01624486595392227, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 276.203125, "completions/mean_terminated_length": 276.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.223912313580513, "epoch": 0.9301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.3035182604405826, "kl": 0.07586102187633514, "learning_rate": 8.727617400918978e-07, "loss": -0.0102, "num_tokens": 23927271.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001288652420044, "sampling/importance_sampling_ratio/min": 0.3184730112552643, "sampling/sampling_logp_difference/max": 1.144217610359192, "sampling/sampling_logp_difference/mean": 0.011804968118667603, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 348.734375, "completions/mean_terminated_length": 348.734375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.30830830335617065, "epoch": 0.9313725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.052828823043434545, "kl": 0.0683603584766388, "learning_rate": 8.722865758185035e-07, "loss": 0.0006, "num_tokens": 23967398.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000340938568115, "sampling/importance_sampling_ratio/min": 0.2917536795139313, "sampling/sampling_logp_difference/max": 1.5527136325836182, "sampling/sampling_logp_difference/mean": 0.014943273738026619, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 348.96875, "completions/mean_terminated_length": 348.96875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.32950258255004883, "epoch": 0.9325980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.0608405048500464, "kl": 0.09067786484956741, "learning_rate": 8.718106558269452e-07, "loss": 0.0174, "num_tokens": 24008692.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998042583465576, "sampling/importance_sampling_ratio/min": 0.09122932702302933, "sampling/sampling_logp_difference/max": 2.394378900527954, "sampling/sampling_logp_difference/mean": 0.014266250655055046, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2407093495130539, "epoch": 0.9338235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 2.049874914237902, "kl": 0.07820361852645874, "learning_rate": 8.713339810833105e-07, "loss": 0.0515, "num_tokens": 24033780.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007174015045166, "sampling/importance_sampling_ratio/min": 0.571811854839325, "sampling/sampling_logp_difference/max": 0.7264003753662109, "sampling/sampling_logp_difference/mean": 0.014713791199028492, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 275.828125, "completions/mean_terminated_length": 275.828125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.2686575651168823, "epoch": 0.9350490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.3854418319610282, "kl": 0.09390707314014435, "learning_rate": 8.708565525552189e-07, "loss": -0.0016, "num_tokens": 24069913.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6036155223846436, "sampling/importance_sampling_ratio/mean": 0.9999788403511047, "sampling/importance_sampling_ratio/min": 0.18501514196395874, "sampling/sampling_logp_difference/max": 1.6873176097869873, "sampling/sampling_logp_difference/mean": 0.014508314430713654, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 317.296875, "completions/mean_terminated_length": 317.296875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.328898549079895, "epoch": 0.9362745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.948734908999543, "kl": 0.0919274240732193, "learning_rate": 8.703783712118202e-07, "loss": -0.008, "num_tokens": 24113628.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006535053253174, "sampling/importance_sampling_ratio/min": 0.2388659417629242, "sampling/sampling_logp_difference/max": 1.4318528175354004, "sampling/sampling_logp_difference/mean": 0.017194965854287148, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 261.609375, "completions/mean_terminated_length": 261.609375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.19261127710342407, "epoch": 0.9375, "frac_reward_zero_std": 1.0, "grad_norm": 0.05829228337489209, "kl": 0.06861470639705658, "learning_rate": 8.69899438023792e-07, "loss": 0.0006, "num_tokens": 24146531.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999251365661621, "sampling/importance_sampling_ratio/min": 0.5260617733001709, "sampling/sampling_logp_difference/max": 0.9457783699035645, "sampling/sampling_logp_difference/mean": 0.011656357906758785, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 400.125, "completions/mean_terminated_length": 400.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2249448448419571, "epoch": 0.9387254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.5009311419991561, "kl": 0.06286576390266418, "learning_rate": 8.694197539633385e-07, "loss": -0.0244, "num_tokens": 24192123.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9170788526535034, "sampling/importance_sampling_ratio/mean": 0.9997783899307251, "sampling/importance_sampling_ratio/min": 0.4161491394042969, "sampling/sampling_logp_difference/max": 0.8767116069793701, "sampling/sampling_logp_difference/mean": 0.010771805420517921, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 228.640625, "completions/mean_terminated_length": 228.640625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.32457515597343445, "epoch": 0.9399509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.4842880579057736, "kl": 0.10564848780632019, "learning_rate": 8.689393200041878e-07, "loss": 0.0219, "num_tokens": 24226564.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.689996361732483, "sampling/importance_sampling_ratio/mean": 1.0003066062927246, "sampling/importance_sampling_ratio/min": 0.4712963104248047, "sampling/sampling_logp_difference/max": 0.7522683143615723, "sampling/sampling_logp_difference/mean": 0.016728565096855164, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 326.328125, "completions/mean_terminated_length": 326.328125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3114994466304779, "epoch": 0.9411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.8765508643097657, "kl": 0.08524590730667114, "learning_rate": 8.684581371215904e-07, "loss": -0.0087, "num_tokens": 24271401.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.980911374092102, "sampling/importance_sampling_ratio/mean": 0.9998607039451599, "sampling/importance_sampling_ratio/min": 0.28986427187919617, "sampling/sampling_logp_difference/max": 1.238342523574829, "sampling/sampling_logp_difference/mean": 0.01525900699198246, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 206.6875, "completions/mean_terminated_length": 206.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.26448220014572144, "epoch": 0.9424019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.07077786446123661, "kl": 0.09754940867424011, "learning_rate": 8.679762062923175e-07, "loss": 0.001, "num_tokens": 24300901.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993366599082947, "sampling/importance_sampling_ratio/min": 0.5680510997772217, "sampling/sampling_logp_difference/max": 0.8461496829986572, "sampling/sampling_logp_difference/mean": 0.014535041525959969, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 275.359375, "completions/mean_terminated_length": 275.359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.1998107135295868, "epoch": 0.9436274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.04915317473340469, "kl": 0.07948745787143707, "learning_rate": 8.674935284946576e-07, "loss": 0.0007, "num_tokens": 24331724.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5431855916976929, "sampling/importance_sampling_ratio/mean": 0.9998654723167419, "sampling/importance_sampling_ratio/min": 0.6153616309165955, "sampling/sampling_logp_difference/max": 0.48554515838623047, "sampling/sampling_logp_difference/mean": 0.011757083237171173, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.18456797301769257, "epoch": 0.9448529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.057234599373731744, "kl": 0.06149008870124817, "learning_rate": 8.670101047084162e-07, "loss": 0.0006, "num_tokens": 24361972.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.759658694267273, "sampling/importance_sampling_ratio/mean": 1.0008488893508911, "sampling/importance_sampling_ratio/min": 0.5163919925689697, "sampling/sampling_logp_difference/max": 0.6608891487121582, "sampling/sampling_logp_difference/mean": 0.010988589376211166, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2671198844909668, "epoch": 0.946078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.1167158123492245, "kl": 0.07398052513599396, "learning_rate": 8.66525935914913e-07, "loss": -0.0101, "num_tokens": 24390348.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.642784595489502, "sampling/importance_sampling_ratio/mean": 0.9997448921203613, "sampling/importance_sampling_ratio/min": 0.6081196665763855, "sampling/sampling_logp_difference/max": 0.49738359451293945, "sampling/sampling_logp_difference/mean": 0.014165053144097328, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 243.6875, "completions/mean_terminated_length": 243.6875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.24403822422027588, "epoch": 0.9473039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.227076431898468, "kl": 0.07924221456050873, "learning_rate": 8.660410230969804e-07, "loss": 0.0109, "num_tokens": 24422008.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9937978982925415, "sampling/importance_sampling_ratio/mean": 0.9996981620788574, "sampling/importance_sampling_ratio/min": 0.5999762415885925, "sampling/sampling_logp_difference/max": 0.6900413036346436, "sampling/sampling_logp_difference/mean": 0.013857114128768444, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 213.59375, "completions/mean_terminated_length": 213.59375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.23578113317489624, "epoch": 0.9485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0583291347233092, "kl": 0.07998788356781006, "learning_rate": 8.655553672389599e-07, "loss": 0.0008, "num_tokens": 24451822.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5902295112609863, "sampling/importance_sampling_ratio/mean": 0.9997382164001465, "sampling/importance_sampling_ratio/min": 0.4708743989467621, "sampling/sampling_logp_difference/max": 0.7531639337539673, "sampling/sampling_logp_difference/mean": 0.014234479516744614, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27442288398742676, "epoch": 0.9497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.22505439861754856, "kl": 0.11774364113807678, "learning_rate": 8.650689693267026e-07, "loss": 0.0012, "num_tokens": 24486262.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995816946029663, "sampling/importance_sampling_ratio/min": 0.26457977294921875, "sampling/sampling_logp_difference/max": 1.3296124935150146, "sampling/sampling_logp_difference/mean": 0.01688530668616295, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 226.0625, "completions/mean_terminated_length": 226.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.25847139954566956, "epoch": 0.9509803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.1638385298942386, "kl": 0.11118605732917786, "learning_rate": 8.645818303475654e-07, "loss": 0.0134, "num_tokens": 24518186.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000941753387451, "sampling/importance_sampling_ratio/min": 0.4972367584705353, "sampling/sampling_logp_difference/max": 2.187211036682129, "sampling/sampling_logp_difference/mean": 0.014578125439584255, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 222.609375, "completions/mean_terminated_length": 222.609375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24409694969654083, "epoch": 0.9522058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.687171589855556, "kl": 0.09179659187793732, "learning_rate": 8.640939512904095e-07, "loss": -0.0488, "num_tokens": 24552257.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7985886335372925, "sampling/importance_sampling_ratio/mean": 0.9997256994247437, "sampling/importance_sampling_ratio/min": 0.4481438100337982, "sampling/sampling_logp_difference/max": 0.8026411533355713, "sampling/sampling_logp_difference/mean": 0.014489128254354, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 186.90625, "completions/mean_terminated_length": 186.90625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.22848191857337952, "epoch": 0.9534313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.1136037290768883, "kl": 0.08895926177501678, "learning_rate": 8.636053331455986e-07, "loss": 0.0008, "num_tokens": 24582123.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6115034818649292, "sampling/importance_sampling_ratio/mean": 0.9996715784072876, "sampling/importance_sampling_ratio/min": 0.273299902677536, "sampling/sampling_logp_difference/max": 1.2971855401992798, "sampling/sampling_logp_difference/mean": 0.01470221672207117, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 211.078125, "completions/mean_terminated_length": 211.078125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.23104828596115112, "epoch": 0.9546568627450981, "frac_reward_zero_std": 0.75, "grad_norm": 1.4125139299465415, "kl": 0.06623545289039612, "learning_rate": 8.631159769049964e-07, "loss": -0.003, "num_tokens": 24616768.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8640655279159546, "sampling/importance_sampling_ratio/mean": 1.000211477279663, "sampling/importance_sampling_ratio/min": 0.26969853043556213, "sampling/sampling_logp_difference/max": 1.310450553894043, "sampling/sampling_logp_difference/mean": 0.014722302556037903, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.294430136680603, "epoch": 0.9558823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.09517564557086731, "kl": 0.1045638769865036, "learning_rate": 8.626258835619653e-07, "loss": 0.0011, "num_tokens": 24645704.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6792397499084473, "sampling/importance_sampling_ratio/mean": 0.9997049570083618, "sampling/importance_sampling_ratio/min": 0.2991315722465515, "sampling/sampling_logp_difference/max": 1.206871747970581, "sampling/sampling_logp_difference/mean": 0.016724448651075363, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 335.984375, "completions/mean_terminated_length": 335.984375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.28677886724472046, "epoch": 0.9571078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.3262884170193108, "kl": 0.07529214024543762, "learning_rate": 8.621350541113636e-07, "loss": 0.0445, "num_tokens": 24685319.0, "reward": 0.0625, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7604070901870728, "sampling/importance_sampling_ratio/mean": 0.9992750287055969, "sampling/importance_sampling_ratio/min": 0.25306573510169983, "sampling/sampling_logp_difference/max": 1.3741059303283691, "sampling/sampling_logp_difference/mean": 0.014710287563502789, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.28004491329193115, "epoch": 0.9583333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 1.5659166278465684, "kl": 0.07568037509918213, "learning_rate": 8.616434895495439e-07, "loss": 0.0382, "num_tokens": 24710551.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5768544673919678, "sampling/importance_sampling_ratio/mean": 0.9995042681694031, "sampling/importance_sampling_ratio/min": 0.6098422408103943, "sampling/sampling_logp_difference/max": 0.4945549964904785, "sampling/sampling_logp_difference/mean": 0.014809216372668743, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 228.734375, "completions/mean_terminated_length": 228.734375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.27852103114128113, "epoch": 0.9595588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.7220481070785896, "kl": 0.08494658768177032, "learning_rate": 8.611511908743514e-07, "loss": -0.0265, "num_tokens": 24739846.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.607839584350586, "sampling/importance_sampling_ratio/mean": 0.9999562501907349, "sampling/importance_sampling_ratio/min": 0.4443697929382324, "sampling/sampling_logp_difference/max": 0.8110982179641724, "sampling/sampling_logp_difference/mean": 0.014362575486302376, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 194.84375, "completions/mean_terminated_length": 194.84375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.22808068990707397, "epoch": 0.9607843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.3911844185995597, "kl": 0.16934823989868164, "learning_rate": 8.606581590851208e-07, "loss": 0.011, "num_tokens": 24767356.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005059242248535, "sampling/importance_sampling_ratio/min": 0.44347065687179565, "sampling/sampling_logp_difference/max": 0.9142541885375977, "sampling/sampling_logp_difference/mean": 0.013863971456885338, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 196.09375, "completions/mean_terminated_length": 196.09375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.26508060097694397, "epoch": 0.9620098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 1.4015269869062008, "kl": 0.09637516736984253, "learning_rate": 8.601643951826758e-07, "loss": -0.0127, "num_tokens": 24799634.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003741979599, "sampling/importance_sampling_ratio/min": 0.3292045593261719, "sampling/sampling_logp_difference/max": 1.2245583534240723, "sampling/sampling_logp_difference/mean": 0.015394306741654873, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 173.09375, "completions/mean_terminated_length": 173.09375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2639704942703247, "epoch": 0.9632352941176471, "frac_reward_zero_std": 0.75, "grad_norm": 1.4386105462370795, "kl": 0.09897458553314209, "learning_rate": 8.596699001693255e-07, "loss": 0.0098, "num_tokens": 24826968.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000314712524414, "sampling/importance_sampling_ratio/min": 0.5619944930076599, "sampling/sampling_logp_difference/max": 0.7084627151489258, "sampling/sampling_logp_difference/mean": 0.01590142212808132, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 212.921875, "completions/mean_terminated_length": 212.921875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.24254243075847626, "epoch": 0.9644607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.05603446342313128, "kl": 0.10858361423015594, "learning_rate": 8.591746750488637e-07, "loss": 0.0009, "num_tokens": 24860307.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6227291822433472, "sampling/importance_sampling_ratio/mean": 0.9999048113822937, "sampling/importance_sampling_ratio/min": 0.5034974217414856, "sampling/sampling_logp_difference/max": 0.6861767768859863, "sampling/sampling_logp_difference/mean": 0.014957180246710777, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 172.734375, "completions/mean_terminated_length": 172.734375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.23348455131053925, "epoch": 0.9656862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 2.5485027589058595, "kl": 0.09483473002910614, "learning_rate": 8.58678720826566e-07, "loss": 0.0051, "num_tokens": 24888482.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000526905059814, "sampling/importance_sampling_ratio/min": 0.4056454002857208, "sampling/sampling_logp_difference/max": 0.9022759199142456, "sampling/sampling_logp_difference/mean": 0.014128241688013077, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 228.015625, "completions/mean_terminated_length": 228.015625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.18648967146873474, "epoch": 0.9669117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 1.0333494594034551, "kl": 0.052099764347076416, "learning_rate": 8.58182038509188e-07, "loss": -0.0773, "num_tokens": 24920851.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.606330394744873, "sampling/importance_sampling_ratio/mean": 0.9999731183052063, "sampling/importance_sampling_ratio/min": 0.6468685269355774, "sampling/sampling_logp_difference/max": 0.4739522933959961, "sampling/sampling_logp_difference/mean": 0.010594777762889862, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 203.140625, "completions/mean_terminated_length": 203.140625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.20786339044570923, "epoch": 0.9681372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 1.728660722184858, "kl": 0.1304941177368164, "learning_rate": 8.576846291049633e-07, "loss": 0.0276, "num_tokens": 24953852.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6535061597824097, "sampling/importance_sampling_ratio/mean": 0.99996417760849, "sampling/importance_sampling_ratio/min": 0.39900103211402893, "sampling/sampling_logp_difference/max": 0.9187912940979004, "sampling/sampling_logp_difference/mean": 0.013810346834361553, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 249.03125, "completions/mean_terminated_length": 249.03125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3435039520263672, "epoch": 0.9693627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 1.693010139356701, "kl": 0.16828186810016632, "learning_rate": 8.571864936236015e-07, "loss": -0.0077, "num_tokens": 24984206.0, "reward": -0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007719993591309, "sampling/importance_sampling_ratio/min": 0.5251613259315491, "sampling/sampling_logp_difference/max": 0.7147760391235352, "sampling/sampling_logp_difference/mean": 0.016702715307474136, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 196.78125, "completions/mean_terminated_length": 196.78125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.22374488413333893, "epoch": 0.9705882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 18.92520845751321, "kl": 0.09307081997394562, "learning_rate": 8.56687633076286e-07, "loss": 0.0112, "num_tokens": 25013152.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.89866304397583, "sampling/importance_sampling_ratio/mean": 1.0002086162567139, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.6411499977111816, "sampling/sampling_logp_difference/mean": 0.013037864118814468, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 175.953125, "completions/mean_terminated_length": 175.953125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.26841437816619873, "epoch": 0.9718137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 2.020253246235909, "kl": 0.15671822428703308, "learning_rate": 8.561880484756724e-07, "loss": 0.0147, "num_tokens": 25044749.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995671510696411, "sampling/importance_sampling_ratio/min": 0.33376604318618774, "sampling/sampling_logp_difference/max": 1.305298089981079, "sampling/sampling_logp_difference/mean": 0.01966940611600876, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 246.671875, "completions/mean_terminated_length": 246.671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2716260254383087, "epoch": 0.9730392156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.4791742571323687, "kl": 0.06522978842258453, "learning_rate": 8.556877408358854e-07, "loss": -0.031, "num_tokens": 25076616.0, "reward": -0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 0.9999372959136963, "sampling/importance_sampling_ratio/min": 0.4611970782279968, "sampling/sampling_logp_difference/max": 0.7739298343658447, "sampling/sampling_logp_difference/mean": 0.014122720807790756, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 224.296875, "completions/mean_terminated_length": 224.296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.331301748752594, "epoch": 0.9742647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.55834443887189, "kl": 0.10484770685434341, "learning_rate": 8.551867111725182e-07, "loss": -0.015, "num_tokens": 25107419.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6215503215789795, "sampling/importance_sampling_ratio/mean": 1.0009255409240723, "sampling/importance_sampling_ratio/min": 0.6292675733566284, "sampling/sampling_logp_difference/max": 0.4833827018737793, "sampling/sampling_logp_difference/mean": 0.01644054800271988, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 233.515625, "completions/mean_terminated_length": 233.515625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31792929768562317, "epoch": 0.9754901960784313, "frac_reward_zero_std": 0.75, "grad_norm": 1.3828992801403757, "kl": 0.1063152477145195, "learning_rate": 8.546849605026288e-07, "loss": -0.007, "num_tokens": 25143484.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 0.9995904564857483, "sampling/importance_sampling_ratio/min": 0.2538815140724182, "sampling/sampling_logp_difference/max": 1.3708875179290771, "sampling/sampling_logp_difference/mean": 0.017424102872610092, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 223.65625, "completions/mean_terminated_length": 223.65625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.31672942638397217, "epoch": 0.9767156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.1051758347447356, "kl": 0.09138567000627518, "learning_rate": 8.541824898447397e-07, "loss": 0.0053, "num_tokens": 25179574.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006200075149536, "sampling/importance_sampling_ratio/min": 0.2614830732345581, "sampling/sampling_logp_difference/max": 1.3413857221603394, "sampling/sampling_logp_difference/mean": 0.016937199980020523, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 228.671875, "completions/mean_terminated_length": 228.671875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.25868871808052063, "epoch": 0.9779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.6750072897039248, "kl": 0.09289942681789398, "learning_rate": 8.536793002188343e-07, "loss": -0.0885, "num_tokens": 25212033.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6071306467056274, "sampling/importance_sampling_ratio/mean": 0.9996358752250671, "sampling/importance_sampling_ratio/min": 0.3208919167518616, "sampling/sampling_logp_difference/max": 1.1366509199142456, "sampling/sampling_logp_difference/mean": 0.01383034698665142, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 197.484375, "completions/mean_terminated_length": 197.484375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.33253058791160583, "epoch": 0.9791666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 1.3666806788141803, "kl": 0.10231654345989227, "learning_rate": 8.531753926463556e-07, "loss": -0.0154, "num_tokens": 25249680.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7784098386764526, "sampling/importance_sampling_ratio/mean": 0.9992489814758301, "sampling/importance_sampling_ratio/min": 0.4159637987613678, "sampling/sampling_logp_difference/max": 0.8771570920944214, "sampling/sampling_logp_difference/mean": 0.01705138012766838, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3406507670879364, "epoch": 0.9803921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.3752395077662523, "kl": 0.09344251453876495, "learning_rate": 8.526707681502043e-07, "loss": -0.0142, "num_tokens": 25297120.0, "reward": 0.40625, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7945382595062256, "sampling/importance_sampling_ratio/mean": 1.0003046989440918, "sampling/importance_sampling_ratio/min": 0.39636990427970886, "sampling/sampling_logp_difference/max": 0.9254074096679688, "sampling/sampling_logp_difference/mean": 0.015919357538223267, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 201.15625, "completions/mean_terminated_length": 201.15625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.22913354635238647, "epoch": 0.9816176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 1.5541191621953963, "kl": 0.08892589807510376, "learning_rate": 8.521654277547361e-07, "loss": 0.0113, "num_tokens": 25330314.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002615451812744, "sampling/importance_sampling_ratio/min": 0.43362876772880554, "sampling/sampling_logp_difference/max": 0.835566520690918, "sampling/sampling_logp_difference/mean": 0.015550851821899414, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 196.515625, "completions/mean_terminated_length": 196.515625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.27712690830230713, "epoch": 0.9828431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.05341884815081776, "kl": 0.09480875730514526, "learning_rate": 8.516593724857597e-07, "loss": 0.0009, "num_tokens": 25361051.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8076027631759644, "sampling/importance_sampling_ratio/mean": 0.9997462034225464, "sampling/importance_sampling_ratio/min": 0.4924623370170593, "sampling/sampling_logp_difference/max": 0.7083373069763184, "sampling/sampling_logp_difference/mean": 0.01574290730059147, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 224.609375, "completions/mean_terminated_length": 224.609375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.32985979318618774, "epoch": 0.9840686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 1.6215617047141229, "kl": 0.13667967915534973, "learning_rate": 8.511526033705356e-07, "loss": 0.001, "num_tokens": 25393330.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.7628861665725708, "sampling/importance_sampling_ratio/mean": 0.9999837279319763, "sampling/importance_sampling_ratio/min": 0.5363121628761292, "sampling/sampling_logp_difference/max": 0.6230388879776001, "sampling/sampling_logp_difference/mean": 0.017135314643383026, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 259.96875, "completions/mean_terminated_length": 259.96875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.26032283902168274, "epoch": 0.9852941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.9860506551774538, "kl": 0.08530402183532715, "learning_rate": 8.506451214377728e-07, "loss": 0.066, "num_tokens": 25426640.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9493076801300049, "sampling/importance_sampling_ratio/mean": 0.9999415874481201, "sampling/importance_sampling_ratio/min": 0.47820186614990234, "sampling/sampling_logp_difference/max": 0.7377223968505859, "sampling/sampling_logp_difference/mean": 0.015485418029129505, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 199.078125, "completions/mean_terminated_length": 199.078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2238267958164215, "epoch": 0.9865196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.061919552971392476, "kl": 0.09052140265703201, "learning_rate": 8.501369277176274e-07, "loss": 0.0009, "num_tokens": 25461781.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000416040420532, "sampling/importance_sampling_ratio/min": 0.476005882024765, "sampling/sampling_logp_difference/max": 1.229488730430603, "sampling/sampling_logp_difference/mean": 0.01378423348069191, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.24778124690055847, "epoch": 0.9877450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.1377328553002952, "kl": 0.11969804763793945, "learning_rate": 8.496280232417007e-07, "loss": 0.0112, "num_tokens": 25499621.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000567436218262, "sampling/importance_sampling_ratio/min": 0.43894508481025696, "sampling/sampling_logp_difference/max": 0.9489450454711914, "sampling/sampling_logp_difference/mean": 0.014659663662314415, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 241.71875, "completions/mean_terminated_length": 241.71875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2911279797554016, "epoch": 0.9889705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.4861659736004562, "kl": 0.10022599250078201, "learning_rate": 8.491184090430363e-07, "loss": 0.0202, "num_tokens": 25531443.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6779921054840088, "sampling/importance_sampling_ratio/mean": 1.000156283378601, "sampling/importance_sampling_ratio/min": 0.3601919114589691, "sampling/sampling_logp_difference/max": 1.021118402481079, "sampling/sampling_logp_difference/mean": 0.015354365110397339, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 259.4375, "completions/mean_terminated_length": 259.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.23779773712158203, "epoch": 0.9901960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0426753369135231, "kl": 0.0802290141582489, "learning_rate": 8.48608086156119e-07, "loss": 0.0007, "num_tokens": 25567967.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005545616149902, "sampling/importance_sampling_ratio/min": 0.48663073778152466, "sampling/sampling_logp_difference/max": 1.4357918500900269, "sampling/sampling_logp_difference/mean": 0.014127079397439957, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 176.78125, "completions/mean_terminated_length": 176.78125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.22128558158874512, "epoch": 0.991421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.3516200128811913, "kl": 0.10866418480873108, "learning_rate": 8.480970556168717e-07, "loss": 0.0343, "num_tokens": 25591345.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6000850200653076, "sampling/importance_sampling_ratio/mean": 1.0005159378051758, "sampling/importance_sampling_ratio/min": 0.5678879618644714, "sampling/sampling_logp_difference/max": 0.565831184387207, "sampling/sampling_logp_difference/mean": 0.013683602213859558, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 310.96875, "completions/mean_terminated_length": 310.96875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.31794682145118713, "epoch": 0.9926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.5039799554179456, "kl": 0.08307865262031555, "learning_rate": 8.47585318462654e-07, "loss": -0.0748, "num_tokens": 25631343.0, "reward": -0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6089153289794922, "sampling/importance_sampling_ratio/mean": 1.0002610683441162, "sampling/importance_sampling_ratio/min": 0.4608144164085388, "sampling/sampling_logp_difference/max": 0.7747598886489868, "sampling/sampling_logp_difference/mean": 0.014763688668608665, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 294.59375, "completions/mean_terminated_length": 294.59375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2942918837070465, "epoch": 0.9938725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.274001862638374, "kl": 0.058023639023303986, "learning_rate": 8.470728757322603e-07, "loss": 0.0185, "num_tokens": 25671749.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.65156888961792, "sampling/importance_sampling_ratio/mean": 1.0002224445343018, "sampling/importance_sampling_ratio/min": 0.556668758392334, "sampling/sampling_logp_difference/max": 0.585784912109375, "sampling/sampling_logp_difference/mean": 0.014879814349114895, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 194.515625, "completions/mean_terminated_length": 194.515625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.19599872827529907, "epoch": 0.9950980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.4377343040997632, "kl": 0.10231795907020569, "learning_rate": 8.465597284659163e-07, "loss": -0.0092, "num_tokens": 25697398.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5985956192016602, "sampling/importance_sampling_ratio/mean": 1.0001569986343384, "sampling/importance_sampling_ratio/min": 0.6593726277351379, "sampling/sampling_logp_difference/max": 0.46912550926208496, "sampling/sampling_logp_difference/mean": 0.011592017486691475, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 251.515625, "completions/mean_terminated_length": 251.515625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.23744173347949982, "epoch": 0.9963235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.9911021494469198, "kl": 0.10130766034126282, "learning_rate": 8.460458777052788e-07, "loss": 0.0229, "num_tokens": 25731127.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000287652015686, "sampling/importance_sampling_ratio/min": 0.3292010724544525, "sampling/sampling_logp_difference/max": 1.1110866069793701, "sampling/sampling_logp_difference/mean": 0.012608356773853302, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 270.484375, "completions/mean_terminated_length": 270.484375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.33210045099258423, "epoch": 0.9975490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.5617785593321691, "kl": 0.13967446982860565, "learning_rate": 8.455313244934324e-07, "loss": 0.0606, "num_tokens": 25768550.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.7586973905563354, "sampling/importance_sampling_ratio/mean": 0.9992921352386475, "sampling/importance_sampling_ratio/min": 0.48241546750068665, "sampling/sampling_logp_difference/max": 0.7289495468139648, "sampling/sampling_logp_difference/mean": 0.01595686748623848, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 367.5625, "completions/mean_terminated_length": 367.5625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2818757891654968, "epoch": 0.9987745098039216, "frac_reward_zero_std": 0.25, "grad_norm": 1.1640668352284114, "kl": 0.10220036655664444, "learning_rate": 8.450160698748871e-07, "loss": 0.0247, "num_tokens": 25808106.0, "reward": 0.40625, "reward_std": 0.5986068248748779, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7544389963150024, "sampling/importance_sampling_ratio/mean": 1.0004475116729736, "sampling/importance_sampling_ratio/min": 0.6172945499420166, "sampling/sampling_logp_difference/max": 0.562149167060852, "sampling/sampling_logp_difference/mean": 0.014506627805531025, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 211.359375, "completions/mean_terminated_length": 211.359375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2624770402908325, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 1.7630636551547239, "kl": 0.13299322128295898, "learning_rate": 8.445001148955775e-07, "loss": 0.0602, "num_tokens": 25836497.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5277621746063232, "sampling/importance_sampling_ratio/mean": 0.9998353123664856, "sampling/importance_sampling_ratio/min": 0.49843165278434753, "sampling/sampling_logp_difference/max": 0.6962888240814209, "sampling/sampling_logp_difference/mean": 0.013662048615515232, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 190.421875, "completions/mean_terminated_length": 190.421875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3429885506629944, "epoch": 1.0012254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.1742526347707032, "kl": 0.1756715178489685, "learning_rate": 8.439834606028593e-07, "loss": 0.0144, "num_tokens": 25868172.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9485623836517334, "sampling/importance_sampling_ratio/mean": 1.0000386238098145, "sampling/importance_sampling_ratio/min": 0.14807981252670288, "sampling/sampling_logp_difference/max": 1.910003900527954, "sampling/sampling_logp_difference/mean": 0.019224446266889572, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 329.390625, "completions/mean_terminated_length": 329.390625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.2481330931186676, "epoch": 1.0024509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.1708314419743115, "kl": 0.08997122943401337, "learning_rate": 8.434661080455082e-07, "loss": 0.0489, "num_tokens": 25908165.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001662969589233, "sampling/importance_sampling_ratio/min": 0.3522055149078369, "sampling/sampling_logp_difference/max": 1.0435404777526855, "sampling/sampling_logp_difference/mean": 0.01300597470253706, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 167.546875, "completions/mean_terminated_length": 167.546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.18984636664390564, "epoch": 1.0036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.68223338459352, "kl": 0.12585148215293884, "learning_rate": 8.42948058273717e-07, "loss": 0.0144, "num_tokens": 25931880.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6631553173065186, "sampling/importance_sampling_ratio/mean": 0.999846339225769, "sampling/importance_sampling_ratio/min": 0.4810182452201843, "sampling/sampling_logp_difference/max": 0.7318501472473145, "sampling/sampling_logp_difference/mean": 0.012167340144515038, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 213.171875, "completions/mean_terminated_length": 213.171875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.29106903076171875, "epoch": 1.0049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.04732077554474528, "kl": 0.14875267446041107, "learning_rate": 8.424293123390938e-07, "loss": 0.0011, "num_tokens": 25961155.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998732805252075, "sampling/importance_sampling_ratio/min": 0.5073512196540833, "sampling/sampling_logp_difference/max": 1.0273196697235107, "sampling/sampling_logp_difference/mean": 0.016036391258239746, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 199.828125, "completions/mean_terminated_length": 199.828125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.31752485036849976, "epoch": 1.0061274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.646825993560745, "kl": 0.14398261904716492, "learning_rate": 8.4190987129466e-07, "loss": -0.0032, "num_tokens": 25992632.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999907612800598, "sampling/importance_sampling_ratio/min": 0.38680943846702576, "sampling/sampling_logp_difference/max": 0.9498231410980225, "sampling/sampling_logp_difference/mean": 0.018284928053617477, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 240.09375, "completions/mean_terminated_length": 240.09375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.300971657037735, "epoch": 1.0073529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.410952001223642, "kl": 0.08495702594518661, "learning_rate": 8.413897361948483e-07, "loss": 0.0394, "num_tokens": 26025518.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5744410753250122, "sampling/importance_sampling_ratio/mean": 1.0003352165222168, "sampling/importance_sampling_ratio/min": 0.5256655216217041, "sampling/sampling_logp_difference/max": 0.6430901288986206, "sampling/sampling_logp_difference/mean": 0.015221796929836273, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 267.625, "completions/mean_terminated_length": 267.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2923009693622589, "epoch": 1.008578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.5553399319783574, "kl": 0.079134002327919, "learning_rate": 8.408689080954997e-07, "loss": 0.03, "num_tokens": 26063638.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6193829774856567, "sampling/importance_sampling_ratio/mean": 1.0001189708709717, "sampling/importance_sampling_ratio/min": 0.522817850112915, "sampling/sampling_logp_difference/max": 0.648522138595581, "sampling/sampling_logp_difference/mean": 0.014686044305562973, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 231.703125, "completions/mean_terminated_length": 231.703125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.23314502835273743, "epoch": 1.0098039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.3422052934420183, "kl": 0.07978282868862152, "learning_rate": 8.403473880538625e-07, "loss": 0.0162, "num_tokens": 26098547.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001015663146973, "sampling/importance_sampling_ratio/min": 0.2892739176750183, "sampling/sampling_logp_difference/max": 1.2403812408447266, "sampling/sampling_logp_difference/mean": 0.01295526884496212, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 187.78125, "completions/mean_terminated_length": 187.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3091316223144531, "epoch": 1.0110294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.8914285588470658, "kl": 0.11706522852182388, "learning_rate": 8.398251771285892e-07, "loss": 0.0428, "num_tokens": 26134501.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.8952840566635132, "sampling/importance_sampling_ratio/mean": 0.9994056820869446, "sampling/importance_sampling_ratio/min": 0.28077879548072815, "sampling/sampling_logp_difference/max": 1.2701880931854248, "sampling/sampling_logp_difference/mean": 0.017271850258111954, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 265.40625, "completions/mean_terminated_length": 265.40625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3018098771572113, "epoch": 1.0122549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 2.111976284918549, "kl": 0.08650106191635132, "learning_rate": 8.393022763797346e-07, "loss": -0.0658, "num_tokens": 26166671.0, "reward": 0.5, "reward_std": 0.6393726468086243, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9298874139785767, "sampling/importance_sampling_ratio/mean": 0.9999447464942932, "sampling/importance_sampling_ratio/min": 0.308843731880188, "sampling/sampling_logp_difference/max": 1.174919843673706, "sampling/sampling_logp_difference/mean": 0.015505645424127579, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 159.0625, "completions/mean_terminated_length": 159.0625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2503647804260254, "epoch": 1.0134803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.527716455285949, "kl": 0.16784119606018066, "learning_rate": 8.387786868687548e-07, "loss": 0.0194, "num_tokens": 26188771.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8835201263427734, "sampling/importance_sampling_ratio/mean": 1.0008383989334106, "sampling/importance_sampling_ratio/min": 0.5911896824836731, "sampling/sampling_logp_difference/max": 0.6331424713134766, "sampling/sampling_logp_difference/mean": 0.015132634900510311, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 259.28125, "completions/mean_terminated_length": 259.28125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3421180844306946, "epoch": 1.0147058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.8193258955723857, "kl": 0.08924973011016846, "learning_rate": 8.382544096585026e-07, "loss": 0.055, "num_tokens": 26220629.0, "reward": -0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5263513326644897, "sampling/importance_sampling_ratio/mean": 1.0001721382141113, "sampling/importance_sampling_ratio/min": 0.43096065521240234, "sampling/sampling_logp_difference/max": 0.8417384624481201, "sampling/sampling_logp_difference/mean": 0.017179038375616074, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.25044241547584534, "epoch": 1.0159313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.04693399444492541, "kl": 0.05882733687758446, "learning_rate": 8.37729445813228e-07, "loss": 0.0006, "num_tokens": 26256973.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.762977123260498, "sampling/importance_sampling_ratio/mean": 1.0009088516235352, "sampling/importance_sampling_ratio/min": 0.4902009963989258, "sampling/sampling_logp_difference/max": 0.7129397392272949, "sampling/sampling_logp_difference/mean": 0.014279183931648731, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 208.046875, "completions/mean_terminated_length": 208.046875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2281491756439209, "epoch": 1.017156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.511100354633253, "kl": 0.07732020318508148, "learning_rate": 8.372037963985741e-07, "loss": -0.0297, "num_tokens": 26293424.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6686607599258423, "sampling/importance_sampling_ratio/mean": 1.0004651546478271, "sampling/importance_sampling_ratio/min": 0.31634464859962463, "sampling/sampling_logp_difference/max": 1.1509230136871338, "sampling/sampling_logp_difference/mean": 0.014121164567768574, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 254.609375, "completions/mean_terminated_length": 254.609375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.21974888443946838, "epoch": 1.0183823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.05270365694449295, "kl": 0.07207030057907104, "learning_rate": 8.366774624815761e-07, "loss": 0.0007, "num_tokens": 26330711.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.626305341720581, "sampling/importance_sampling_ratio/mean": 0.9997680187225342, "sampling/importance_sampling_ratio/min": 0.28228700160980225, "sampling/sampling_logp_difference/max": 1.2648309469223022, "sampling/sampling_logp_difference/mean": 0.013425543904304504, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 166.84375, "completions/mean_terminated_length": 166.84375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2337384819984436, "epoch": 1.0196078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.526393532776338, "kl": 0.10532155632972717, "learning_rate": 8.361504451306584e-07, "loss": -0.0247, "num_tokens": 26363629.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6090630292892456, "sampling/importance_sampling_ratio/mean": 1.0000205039978027, "sampling/importance_sampling_ratio/min": 0.528343677520752, "sampling/sampling_logp_difference/max": 0.6380083560943604, "sampling/sampling_logp_difference/mean": 0.013625487685203552, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 203.984375, "completions/mean_terminated_length": 203.984375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2802252769470215, "epoch": 1.0208333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 2.084559533474804, "kl": 0.11052876710891724, "learning_rate": 8.356227454156328e-07, "loss": -0.0214, "num_tokens": 26392588.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.9827526807785034, "sampling/importance_sampling_ratio/mean": 0.999845564365387, "sampling/importance_sampling_ratio/min": 0.37227943539619446, "sampling/sampling_logp_difference/max": 0.9881105422973633, "sampling/sampling_logp_difference/mean": 0.016211293637752533, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 207.015625, "completions/mean_terminated_length": 207.015625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.27637845277786255, "epoch": 1.0220588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.158562184915467, "kl": 0.09916109591722488, "learning_rate": 8.350943644076964e-07, "loss": 0.007, "num_tokens": 26422253.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004613399505615, "sampling/importance_sampling_ratio/min": 0.49313226342201233, "sampling/sampling_logp_difference/max": 0.7555751800537109, "sampling/sampling_logp_difference/mean": 0.014827284030616283, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 184.21875, "completions/mean_terminated_length": 184.21875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.210306316614151, "epoch": 1.0232843137254901, "frac_reward_zero_std": 0.75, "grad_norm": 1.4539889291402033, "kl": 0.12316978722810745, "learning_rate": 8.34565303179429e-07, "loss": 0.0, "num_tokens": 26449435.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4506796598434448, "sampling/importance_sampling_ratio/mean": 1.000178337097168, "sampling/importance_sampling_ratio/min": 0.6327508687973022, "sampling/sampling_logp_difference/max": 0.45767855644226074, "sampling/sampling_logp_difference/mean": 0.01168773416429758, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3543969988822937, "epoch": 1.0245098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.8453556747320308, "kl": 0.16368205845355988, "learning_rate": 8.340355628047917e-07, "loss": -0.0006, "num_tokens": 26481067.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.657509684562683, "sampling/importance_sampling_ratio/mean": 0.9992981553077698, "sampling/importance_sampling_ratio/min": 0.48332542181015015, "sampling/sampling_logp_difference/max": 0.7270650863647461, "sampling/sampling_logp_difference/mean": 0.01765834540128708, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 200.203125, "completions/mean_terminated_length": 200.203125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3506966829299927, "epoch": 1.025735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.057758914875663606, "kl": 0.0971386507153511, "learning_rate": 8.335051443591234e-07, "loss": 0.001, "num_tokens": 26513000.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6982983350753784, "sampling/importance_sampling_ratio/mean": 0.9998024106025696, "sampling/importance_sampling_ratio/min": 0.523671567440033, "sampling/sampling_logp_difference/max": 0.6468906402587891, "sampling/sampling_logp_difference/mean": 0.01773739606142044, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 207.4375, "completions/mean_terminated_length": 207.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1751493513584137, "epoch": 1.0269607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.03497628294846244, "kl": 0.04738932102918625, "learning_rate": 8.329740489191405e-07, "loss": 0.0005, "num_tokens": 26541892.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9018902778625488, "sampling/importance_sampling_ratio/mean": 1.0002743005752563, "sampling/importance_sampling_ratio/min": 0.5053152441978455, "sampling/sampling_logp_difference/max": 0.6825728416442871, "sampling/sampling_logp_difference/mean": 0.010526393540203571, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.33903390169143677, "epoch": 1.0281862745098038, "frac_reward_zero_std": 0.0, "grad_norm": 1.857413451748275, "kl": 0.09599026292562485, "learning_rate": 8.324422775629327e-07, "loss": 0.069, "num_tokens": 26582940.0, "reward": 0.46875, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000169277191162, "sampling/importance_sampling_ratio/min": 0.42311522364616394, "sampling/sampling_logp_difference/max": 1.192920446395874, "sampling/sampling_logp_difference/mean": 0.016587097197771072, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 154.65625, "completions/mean_terminated_length": 154.65625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.18241757154464722, "epoch": 1.0294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0584323257739424, "kl": 0.08496911823749542, "learning_rate": 8.319098313699624e-07, "loss": 0.0009, "num_tokens": 26612262.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7230578660964966, "sampling/importance_sampling_ratio/mean": 0.9996199607849121, "sampling/importance_sampling_ratio/min": 0.5690866708755493, "sampling/sampling_logp_difference/max": 0.5637224912643433, "sampling/sampling_logp_difference/mean": 0.012194900773465633, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 223.96875, "completions/mean_terminated_length": 223.96875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3323877453804016, "epoch": 1.0306372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 1.6835755773075476, "kl": 0.0954112634062767, "learning_rate": 8.313767114210615e-07, "loss": 0.0106, "num_tokens": 26653860.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.684166431427002, "sampling/importance_sampling_ratio/mean": 0.9999719858169556, "sampling/importance_sampling_ratio/min": 0.6183339953422546, "sampling/sampling_logp_difference/max": 0.521270751953125, "sampling/sampling_logp_difference/mean": 0.015068236738443375, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 176.40625, "completions/mean_terminated_length": 176.40625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.22423449158668518, "epoch": 1.031862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 2.41860184529582, "kl": 0.12526249885559082, "learning_rate": 8.308429187984298e-07, "loss": 0.0318, "num_tokens": 26679870.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6323018074035645, "sampling/importance_sampling_ratio/mean": 1.000128149986267, "sampling/importance_sampling_ratio/min": 0.36600685119628906, "sampling/sampling_logp_difference/max": 1.0051032304763794, "sampling/sampling_logp_difference/mean": 0.014712570235133171, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 271.34375, "completions/mean_terminated_length": 271.34375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3061140775680542, "epoch": 1.0330882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.7385203158897113, "kl": 0.08549848198890686, "learning_rate": 8.303084545856322e-07, "loss": 0.0283, "num_tokens": 26723412.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.7838666439056396, "sampling/importance_sampling_ratio/mean": 1.0001444816589355, "sampling/importance_sampling_ratio/min": 0.4810205101966858, "sampling/sampling_logp_difference/max": 0.7318453788757324, "sampling/sampling_logp_difference/mean": 0.015281646512448788, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 277.90625, "completions/mean_terminated_length": 277.90625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2246057689189911, "epoch": 1.0343137254901962, "frac_reward_zero_std": 0.75, "grad_norm": 0.8822044129650618, "kl": 0.09816767275333405, "learning_rate": 8.297733198675977e-07, "loss": 0.0072, "num_tokens": 26763022.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005570650100708, "sampling/importance_sampling_ratio/min": 0.48236799240112305, "sampling/sampling_logp_difference/max": 0.8305244445800781, "sampling/sampling_logp_difference/mean": 0.013231072574853897, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 197.53125, "completions/mean_terminated_length": 197.53125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3606891632080078, "epoch": 1.0355392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.9486747923437764, "kl": 0.13007526099681854, "learning_rate": 8.292375157306155e-07, "loss": 0.006, "num_tokens": 26795600.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5576754808425903, "sampling/importance_sampling_ratio/mean": 0.9999874830245972, "sampling/importance_sampling_ratio/min": 0.6090654134750366, "sampling/sampling_logp_difference/max": 0.49582958221435547, "sampling/sampling_logp_difference/mean": 0.01631821319460869, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 193.78125, "completions/mean_terminated_length": 193.78125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21424145996570587, "epoch": 1.036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.170578415400301, "kl": 0.0991288274526596, "learning_rate": 8.287010432623343e-07, "loss": 0.0082, "num_tokens": 26823602.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6255465745925903, "sampling/importance_sampling_ratio/mean": 0.9990866184234619, "sampling/importance_sampling_ratio/min": 0.3958947956562042, "sampling/sampling_logp_difference/max": 0.9266067743301392, "sampling/sampling_logp_difference/mean": 0.014311027713119984, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 208.078125, "completions/mean_terminated_length": 208.078125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.19055885076522827, "epoch": 1.0379901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8379166457781339, "kl": 0.12437820434570312, "learning_rate": 8.281639035517591e-07, "loss": -0.0026, "num_tokens": 26851303.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6019763946533203, "sampling/importance_sampling_ratio/mean": 0.9999380707740784, "sampling/importance_sampling_ratio/min": 0.3743131160736084, "sampling/sampling_logp_difference/max": 0.9826626777648926, "sampling/sampling_logp_difference/mean": 0.011997069232165813, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.18193748593330383, "epoch": 1.0392156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.08189642669918944, "kl": 0.0676099956035614, "learning_rate": 8.276260976892495e-07, "loss": 0.0006, "num_tokens": 26885919.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999856352806091, "sampling/importance_sampling_ratio/min": 0.4021023213863373, "sampling/sampling_logp_difference/max": 0.9808192253112793, "sampling/sampling_logp_difference/mean": 0.014291755855083466, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.23510439693927765, "epoch": 1.0404411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.05287333372937939, "kl": 0.09308743476867676, "learning_rate": 8.270876267665173e-07, "loss": 0.0008, "num_tokens": 26921551.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.678462266921997, "sampling/importance_sampling_ratio/mean": 1.0004521608352661, "sampling/importance_sampling_ratio/min": 0.3964764177799225, "sampling/sampling_logp_difference/max": 0.9251387715339661, "sampling/sampling_logp_difference/mean": 0.014521561563014984, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 190.859375, "completions/mean_terminated_length": 190.859375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2246999889612198, "epoch": 1.0416666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 1.688193795113662, "kl": 0.1231057196855545, "learning_rate": 8.265484918766242e-07, "loss": -0.0245, "num_tokens": 26946950.0, "reward": -0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6586134433746338, "sampling/importance_sampling_ratio/mean": 0.9999523758888245, "sampling/importance_sampling_ratio/min": 0.43952205777168274, "sampling/sampling_logp_difference/max": 0.822067379951477, "sampling/sampling_logp_difference/mean": 0.01355016976594925, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 174.859375, "completions/mean_terminated_length": 174.859375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24015727639198303, "epoch": 1.0428921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.937277069742771, "kl": 0.10738003253936768, "learning_rate": 8.260086941139804e-07, "loss": 0.0468, "num_tokens": 26981453.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998477697372437, "sampling/importance_sampling_ratio/min": 0.349877268075943, "sampling/sampling_logp_difference/max": 1.4946722984313965, "sampling/sampling_logp_difference/mean": 0.017771970480680466, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 166.984375, "completions/mean_terminated_length": 166.984375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3333693742752075, "epoch": 1.0441176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 2.1691347170068873, "kl": 0.25931957364082336, "learning_rate": 8.254682345743405e-07, "loss": -0.0161, "num_tokens": 27008732.0, "reward": 0.0, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006260871887207, "sampling/importance_sampling_ratio/min": 0.3316207826137543, "sampling/sampling_logp_difference/max": 1.1143497228622437, "sampling/sampling_logp_difference/mean": 0.019735675305128098, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 234.78125, "completions/mean_terminated_length": 234.78125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.23807714879512787, "epoch": 1.045343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9915526582208206, "kl": 0.07801719009876251, "learning_rate": 8.249271143548036e-07, "loss": -0.0117, "num_tokens": 27043742.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002737045288086, "sampling/importance_sampling_ratio/min": 0.3906378149986267, "sampling/sampling_logp_difference/max": 1.68257474899292, "sampling/sampling_logp_difference/mean": 0.014691174030303955, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 224.8125, "completions/mean_terminated_length": 224.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24750757217407227, "epoch": 1.0465686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.1995606231439424, "kl": 0.08733342587947845, "learning_rate": 8.243853345538093e-07, "loss": 0.003, "num_tokens": 27080866.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.817976713180542, "sampling/importance_sampling_ratio/mean": 0.9999638199806213, "sampling/importance_sampling_ratio/min": 0.2541900873184204, "sampling/sampling_logp_difference/max": 1.3696728944778442, "sampling/sampling_logp_difference/mean": 0.015378719195723534, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 230.296875, "completions/mean_terminated_length": 230.296875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.21870526671409607, "epoch": 1.0477941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.3422440007440832, "kl": 0.065306156873703, "learning_rate": 8.238428962711362e-07, "loss": 0.0003, "num_tokens": 27112917.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5593341588974, "sampling/importance_sampling_ratio/mean": 1.000219702720642, "sampling/importance_sampling_ratio/min": 0.5015109777450562, "sampling/sampling_logp_difference/max": 0.6901297569274902, "sampling/sampling_logp_difference/mean": 0.013234477490186691, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2828371226787567, "epoch": 1.0490196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.2205511854288067, "kl": 0.08889597654342651, "learning_rate": 8.232998006078997e-07, "loss": 0.1483, "num_tokens": 27150403.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000083446502686, "sampling/importance_sampling_ratio/min": 0.10448633879423141, "sampling/sampling_logp_difference/max": 2.2586989402770996, "sampling/sampling_logp_difference/mean": 0.015552111901342869, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 219.546875, "completions/mean_terminated_length": 219.546875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.16488586366176605, "epoch": 1.0502450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.18047760725524792, "kl": 0.08904959261417389, "learning_rate": 8.227560486665498e-07, "loss": 0.0008, "num_tokens": 27181606.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006948709487915, "sampling/importance_sampling_ratio/min": 0.4159519672393799, "sampling/sampling_logp_difference/max": 0.8771854639053345, "sampling/sampling_logp_difference/mean": 0.011130171827971935, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 159.03125, "completions/mean_terminated_length": 159.03125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.22817040979862213, "epoch": 1.0514705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.2812865168860976, "kl": 0.16115710139274597, "learning_rate": 8.222116415508682e-07, "loss": 0.0049, "num_tokens": 27207736.0, "reward": -0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.7246307134628296, "sampling/importance_sampling_ratio/mean": 1.0001046657562256, "sampling/importance_sampling_ratio/min": 0.39628320932388306, "sampling/sampling_logp_difference/max": 0.9256261587142944, "sampling/sampling_logp_difference/mean": 0.015050392597913742, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 185.578125, "completions/mean_terminated_length": 185.578125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.15592145919799805, "epoch": 1.0526960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.13370403710709478, "kl": 0.06281337887048721, "learning_rate": 8.21666580365967e-07, "loss": 0.0007, "num_tokens": 27243213.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000174045562744, "sampling/importance_sampling_ratio/min": 0.46774056553840637, "sampling/sampling_logp_difference/max": 0.7598414421081543, "sampling/sampling_logp_difference/mean": 0.011581188067793846, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 199.828125, "completions/mean_terminated_length": 199.828125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2729835510253906, "epoch": 1.053921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.09925805994288021, "kl": 0.0949406623840332, "learning_rate": 8.211208662182858e-07, "loss": 0.0009, "num_tokens": 27277122.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5280382633209229, "sampling/importance_sampling_ratio/mean": 0.9983392953872681, "sampling/importance_sampling_ratio/min": 0.32965970039367676, "sampling/sampling_logp_difference/max": 1.1096943616867065, "sampling/sampling_logp_difference/mean": 0.01671024225652218, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 148.546875, "completions/mean_terminated_length": 148.546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24493072926998138, "epoch": 1.0551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.11044530073772146, "kl": 0.11993096768856049, "learning_rate": 8.205745002155899e-07, "loss": 0.0012, "num_tokens": 27305077.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7267796993255615, "sampling/importance_sampling_ratio/mean": 0.9999323487281799, "sampling/importance_sampling_ratio/min": 0.4598299264907837, "sampling/sampling_logp_difference/max": 0.7768986225128174, "sampling/sampling_logp_difference/mean": 0.017437398433685303, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 182.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.22966301441192627, "epoch": 1.0563725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.735345885320772, "kl": 0.08656550943851471, "learning_rate": 8.200274834669675e-07, "loss": 0.0161, "num_tokens": 27332001.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5988582372665405, "sampling/importance_sampling_ratio/mean": 1.0000367164611816, "sampling/importance_sampling_ratio/min": 0.5221561789512634, "sampling/sampling_logp_difference/max": 0.6497886180877686, "sampling/sampling_logp_difference/mean": 0.013142306357622147, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 182.453125, "completions/mean_terminated_length": 182.453125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.20047461986541748, "epoch": 1.0575980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.6184519494389003, "kl": 0.06266318261623383, "learning_rate": 8.194798170828279e-07, "loss": 0.1024, "num_tokens": 27363582.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9467194080352783, "sampling/importance_sampling_ratio/mean": 0.9996281266212463, "sampling/importance_sampling_ratio/min": 0.35779374837875366, "sampling/sampling_logp_difference/max": 1.0277986526489258, "sampling/sampling_logp_difference/mean": 0.014652173034846783, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 221.09375, "completions/mean_terminated_length": 221.09375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1975773274898529, "epoch": 1.0588235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.2620075077714465, "kl": 0.07988125085830688, "learning_rate": 8.189315021748993e-07, "loss": 0.0345, "num_tokens": 27394596.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7520551681518555, "sampling/importance_sampling_ratio/mean": 0.9999421238899231, "sampling/importance_sampling_ratio/min": 0.5676584839820862, "sampling/sampling_logp_difference/max": 0.5662353038787842, "sampling/sampling_logp_difference/mean": 0.012207001447677612, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 237.0625, "completions/mean_terminated_length": 237.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.24505576491355896, "epoch": 1.0600490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.23068271821984426, "kl": 0.07782909274101257, "learning_rate": 8.183825398562263e-07, "loss": 0.0007, "num_tokens": 27427560.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7276512384414673, "sampling/importance_sampling_ratio/mean": 0.9997975826263428, "sampling/importance_sampling_ratio/min": 0.47389957308769226, "sampling/sampling_logp_difference/max": 0.7467598915100098, "sampling/sampling_logp_difference/mean": 0.01503431424498558, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 131.546875, "completions/mean_terminated_length": 131.546875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.24680377542972565, "epoch": 1.0612745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.13161481041915632, "kl": 0.10586653649806976, "learning_rate": 8.178329312411676e-07, "loss": 0.0011, "num_tokens": 27454363.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6429011821746826, "sampling/importance_sampling_ratio/mean": 1.0001440048217773, "sampling/importance_sampling_ratio/min": 0.4956373870372772, "sampling/sampling_logp_difference/max": 0.7019107341766357, "sampling/sampling_logp_difference/mean": 0.015495985746383667, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 182.8125, "completions/mean_terminated_length": 182.8125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.208645761013031, "epoch": 1.0625, "frac_reward_zero_std": 0.5, "grad_norm": 2.006884775754296, "kl": 0.08351355791091919, "learning_rate": 8.172826774453936e-07, "loss": 0.043, "num_tokens": 27478399.0, "reward": -0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.8095623254776, "sampling/importance_sampling_ratio/mean": 0.9995898008346558, "sampling/importance_sampling_ratio/min": 0.6079089045524597, "sampling/sampling_logp_difference/max": 0.5930850505828857, "sampling/sampling_logp_difference/mean": 0.01453950721770525, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 167.921875, "completions/mean_terminated_length": 167.921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2757169008255005, "epoch": 1.0637254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.8902161735480165, "kl": 0.11209302395582199, "learning_rate": 8.16731779585885e-07, "loss": 0.0087, "num_tokens": 27513754.0, "reward": -0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000875473022461, "sampling/importance_sampling_ratio/min": 0.3849431276321411, "sampling/sampling_logp_difference/max": 0.9546597003936768, "sampling/sampling_logp_difference/mean": 0.018580183386802673, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 167.03125, "completions/mean_terminated_length": 167.03125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.21587997674942017, "epoch": 1.0649509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.1420870386111802, "kl": 0.08559355139732361, "learning_rate": 8.161802387809292e-07, "loss": 0.0008, "num_tokens": 27542732.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.859655737876892, "sampling/importance_sampling_ratio/mean": 1.000511646270752, "sampling/importance_sampling_ratio/min": 0.4743329584598541, "sampling/sampling_logp_difference/max": 0.7458457946777344, "sampling/sampling_logp_difference/mean": 0.015155967324972153, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2327881157398224, "epoch": 1.0661764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.1141680236873273, "kl": 0.07259321212768555, "learning_rate": 8.156280561501194e-07, "loss": 0.0071, "num_tokens": 27581308.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6321566104888916, "sampling/importance_sampling_ratio/mean": 1.00043523311615, "sampling/importance_sampling_ratio/min": 0.3766418993473053, "sampling/sampling_logp_difference/max": 0.9764604568481445, "sampling/sampling_logp_difference/mean": 0.014734484255313873, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 144.84375, "completions/mean_terminated_length": 144.84375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.21607786417007446, "epoch": 1.0674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.09943714136351682, "kl": 0.09778988361358643, "learning_rate": 8.150752328143513e-07, "loss": 0.001, "num_tokens": 27611090.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7520544528961182, "sampling/importance_sampling_ratio/mean": 0.9992098808288574, "sampling/importance_sampling_ratio/min": 0.4616309702396393, "sampling/sampling_logp_difference/max": 0.7729895114898682, "sampling/sampling_logp_difference/mean": 0.01589813269674778, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 189.90625, "completions/mean_terminated_length": 189.90625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.30563634634017944, "epoch": 1.0686274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 2.1816561649436643, "kl": 0.13414150476455688, "learning_rate": 8.145217698958211e-07, "loss": 0.0097, "num_tokens": 27640380.0, "reward": -0.28125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004225969314575, "sampling/importance_sampling_ratio/min": 0.46316248178482056, "sampling/sampling_logp_difference/max": 1.0734158754348755, "sampling/sampling_logp_difference/mean": 0.01720433682203293, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 173.71875, "completions/mean_terminated_length": 173.71875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2072143256664276, "epoch": 1.0698529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.08247576173754775, "kl": 0.07004286348819733, "learning_rate": 8.139676685180236e-07, "loss": 0.0007, "num_tokens": 27668090.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7890275716781616, "sampling/importance_sampling_ratio/mean": 0.9991549849510193, "sampling/importance_sampling_ratio/min": 0.3797149658203125, "sampling/sampling_logp_difference/max": 0.968334436416626, "sampling/sampling_logp_difference/mean": 0.016450336202979088, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.18553629517555237, "epoch": 1.071078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.5123373003069194, "kl": 0.07597959041595459, "learning_rate": 8.134129298057495e-07, "loss": 0.0157, "num_tokens": 27696906.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9365143775939941, "sampling/importance_sampling_ratio/mean": 0.9994390606880188, "sampling/importance_sampling_ratio/min": 0.3349955677986145, "sampling/sampling_logp_difference/max": 1.0936380624771118, "sampling/sampling_logp_difference/mean": 0.014272423461079597, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.21442510187625885, "epoch": 1.0723039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.1909442243640793, "kl": 0.09609478712081909, "learning_rate": 8.128575548850832e-07, "loss": 0.0047, "num_tokens": 27721922.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4697335958480835, "sampling/importance_sampling_ratio/mean": 0.9998018741607666, "sampling/importance_sampling_ratio/min": 0.5075057744979858, "sampling/sampling_logp_difference/max": 0.6782472133636475, "sampling/sampling_logp_difference/mean": 0.012937607243657112, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2670651078224182, "epoch": 1.0735294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.465152893859862, "kl": 0.11509145051240921, "learning_rate": 8.123015448834005e-07, "loss": 0.0165, "num_tokens": 27751970.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000338554382324, "sampling/importance_sampling_ratio/min": 0.2617838978767395, "sampling/sampling_logp_difference/max": 1.340235948562622, "sampling/sampling_logp_difference/mean": 0.016946731135249138, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2190035581588745, "epoch": 1.0747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.7814630699701175, "kl": 0.08700970560312271, "learning_rate": 8.117449009293668e-07, "loss": -0.0111, "num_tokens": 27777058.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00069260597229, "sampling/importance_sampling_ratio/min": 0.5145741701126099, "sampling/sampling_logp_difference/max": 0.8772447109222412, "sampling/sampling_logp_difference/mean": 0.01563248410820961, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 167.140625, "completions/mean_terminated_length": 167.140625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.21881946921348572, "epoch": 1.0759803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.09202604581234461, "kl": 0.09001221507787704, "learning_rate": 8.111876241529337e-07, "loss": 0.0009, "num_tokens": 27805371.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.655460000038147, "sampling/importance_sampling_ratio/mean": 0.9996316432952881, "sampling/importance_sampling_ratio/min": 0.5658692121505737, "sampling/sampling_logp_difference/max": 0.569392204284668, "sampling/sampling_logp_difference/mean": 0.014748353511095047, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 185.453125, "completions/mean_terminated_length": 185.453125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2720149755477905, "epoch": 1.0772058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 2.0505209479500186, "kl": 0.09834681451320648, "learning_rate": 8.106297156853379e-07, "loss": 0.0001, "num_tokens": 27832936.0, "reward": 0.15625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000269412994385, "sampling/importance_sampling_ratio/min": 0.6116071343421936, "sampling/sampling_logp_difference/max": 0.7369050979614258, "sampling/sampling_logp_difference/mean": 0.015585452318191528, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 170.46875, "completions/mean_terminated_length": 170.46875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.20675668120384216, "epoch": 1.0784313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.1771199407375331, "kl": 0.08929380774497986, "learning_rate": 8.100711766590982e-07, "loss": 0.0009, "num_tokens": 27862198.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999538779258728, "sampling/importance_sampling_ratio/min": 0.03980950638651848, "sampling/sampling_logp_difference/max": 3.223649501800537, "sampling/sampling_logp_difference/mean": 0.015134681947529316, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 146.078125, "completions/mean_terminated_length": 146.078125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.18133658170700073, "epoch": 1.079656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.05583044053764275, "kl": 0.07593855261802673, "learning_rate": 8.095120082080134e-07, "loss": 0.0008, "num_tokens": 27887195.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9179813861846924, "sampling/importance_sampling_ratio/mean": 0.9999933242797852, "sampling/importance_sampling_ratio/min": 0.498348593711853, "sampling/sampling_logp_difference/max": 0.6964554786682129, "sampling/sampling_logp_difference/mean": 0.013764860108494759, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 164.5625, "completions/mean_terminated_length": 164.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.24878941476345062, "epoch": 1.0808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.09908678307571109, "kl": 0.08171862363815308, "learning_rate": 8.089522114671602e-07, "loss": 0.0008, "num_tokens": 27919455.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7992045879364014, "sampling/importance_sampling_ratio/mean": 0.9997904300689697, "sampling/importance_sampling_ratio/min": 0.18465721607208252, "sampling/sampling_logp_difference/max": 1.6892540454864502, "sampling/sampling_logp_difference/mean": 0.017147742211818695, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 166.234375, "completions/mean_terminated_length": 166.234375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.20635822415351868, "epoch": 1.0821078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0998980624978608, "kl": 0.058448854833841324, "learning_rate": 8.083917875728905e-07, "loss": 0.0006, "num_tokens": 27949966.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00088632106781, "sampling/importance_sampling_ratio/min": 0.42102333903312683, "sampling/sampling_logp_difference/max": 0.8650670051574707, "sampling/sampling_logp_difference/mean": 0.014977429062128067, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 193.109375, "completions/mean_terminated_length": 193.109375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2436329424381256, "epoch": 1.0833333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 1.1655872943824683, "kl": 0.177456796169281, "learning_rate": 8.07830737662829e-07, "loss": -0.0015, "num_tokens": 27981413.0, "reward": -0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6812682151794434, "sampling/importance_sampling_ratio/mean": 1.0005836486816406, "sampling/importance_sampling_ratio/min": 0.5926125645637512, "sampling/sampling_logp_difference/max": 0.5232144594192505, "sampling/sampling_logp_difference/mean": 0.014726238325238228, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 125.90625, "completions/mean_terminated_length": 125.90625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.18380919098854065, "epoch": 1.0845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.08432329577655735, "kl": 0.0646153911948204, "learning_rate": 8.072690628758721e-07, "loss": 0.0006, "num_tokens": 28007359.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992244243621826, "sampling/importance_sampling_ratio/min": 0.5326099395751953, "sampling/sampling_logp_difference/max": 1.0724267959594727, "sampling/sampling_logp_difference/mean": 0.014375309459865093, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 173.046875, "completions/mean_terminated_length": 173.046875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22392170131206512, "epoch": 1.0857843137254901, "frac_reward_zero_std": 0.75, "grad_norm": 1.3293725544143347, "kl": 0.07281274348497391, "learning_rate": 8.067067643521833e-07, "loss": -0.0065, "num_tokens": 28035122.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6496044397354126, "sampling/importance_sampling_ratio/mean": 0.9999271035194397, "sampling/importance_sampling_ratio/min": 0.5685496926307678, "sampling/sampling_logp_difference/max": 0.5646665692329407, "sampling/sampling_logp_difference/mean": 0.014200421050190926, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.25153499841690063, "epoch": 1.0870098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.06734149153866195, "kl": 0.06395354866981506, "learning_rate": 8.061438432331934e-07, "loss": 0.0006, "num_tokens": 28068754.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001474618911743, "sampling/importance_sampling_ratio/min": 0.416825532913208, "sampling/sampling_logp_difference/max": 1.0578886270523071, "sampling/sampling_logp_difference/mean": 0.015244759619235992, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 198.59375, "completions/mean_terminated_length": 198.59375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.23634985089302063, "epoch": 1.088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.4979627460277762, "kl": 0.10542238503694534, "learning_rate": 8.055803006615965e-07, "loss": 0.004, "num_tokens": 28096424.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6498923301696777, "sampling/importance_sampling_ratio/mean": 1.0002491474151611, "sampling/importance_sampling_ratio/min": 0.07516255229711533, "sampling/sampling_logp_difference/max": 2.588102102279663, "sampling/sampling_logp_difference/mean": 0.013747122138738632, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 137.328125, "completions/mean_terminated_length": 137.328125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1915649175643921, "epoch": 1.0894607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.35661133762129155, "kl": 0.08230677992105484, "learning_rate": 8.050161377813485e-07, "loss": 0.0008, "num_tokens": 28123997.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6965253353118896, "sampling/importance_sampling_ratio/mean": 1.0004847049713135, "sampling/importance_sampling_ratio/min": 0.5760728120803833, "sampling/sampling_logp_difference/max": 0.5515211820602417, "sampling/sampling_logp_difference/mean": 0.013545336201786995, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 132.84375, "completions/mean_terminated_length": 132.84375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2161245346069336, "epoch": 1.0906862745098038, "frac_reward_zero_std": 1.0, "grad_norm": 0.20850140464210487, "kl": 0.11967165768146515, "learning_rate": 8.04451355737664e-07, "loss": 0.0012, "num_tokens": 28148515.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000712871551514, "sampling/importance_sampling_ratio/min": 0.26496079564094543, "sampling/sampling_logp_difference/max": 1.3281733989715576, "sampling/sampling_logp_difference/mean": 0.013996437191963196, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 151.109375, "completions/mean_terminated_length": 151.109375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2761315107345581, "epoch": 1.0919117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.6413452606663361, "kl": 0.0643676295876503, "learning_rate": 8.03885955677015e-07, "loss": 0.0055, "num_tokens": 28180666.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7586278915405273, "sampling/importance_sampling_ratio/mean": 0.9988889694213867, "sampling/importance_sampling_ratio/min": 0.4414573311805725, "sampling/sampling_logp_difference/max": 0.817673921585083, "sampling/sampling_logp_difference/mean": 0.018916862085461617, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.22008417546749115, "epoch": 1.093137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.3192490633186968, "kl": 0.09187553077936172, "learning_rate": 8.033199387471276e-07, "loss": 0.0106, "num_tokens": 28221972.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006036758422852, "sampling/importance_sampling_ratio/min": 0.4824557304382324, "sampling/sampling_logp_difference/max": 0.7369165420532227, "sampling/sampling_logp_difference/mean": 0.015000857412815094, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 202.53125, "completions/mean_terminated_length": 202.53125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.19605609774589539, "epoch": 1.094362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 3.609952827164341, "kl": 0.06165199726819992, "learning_rate": 8.027533060969806e-07, "loss": 0.0367, "num_tokens": 28255702.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003710985183716, "sampling/importance_sampling_ratio/min": 0.32678836584091187, "sampling/sampling_logp_difference/max": 1.1184425354003906, "sampling/sampling_logp_difference/mean": 0.013808038085699081, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 175.234375, "completions/mean_terminated_length": 175.234375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.21981266140937805, "epoch": 1.0955882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.14068715017450223, "kl": 0.09595729410648346, "learning_rate": 8.021860588768021e-07, "loss": 0.0009, "num_tokens": 28281957.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7782487869262695, "sampling/importance_sampling_ratio/mean": 1.0002150535583496, "sampling/importance_sampling_ratio/min": 0.567199170589447, "sampling/sampling_logp_difference/max": 0.5756289958953857, "sampling/sampling_logp_difference/mean": 0.014246910810470581, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 189.3125, "completions/mean_terminated_length": 189.3125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.268449068069458, "epoch": 1.0968137254901962, "frac_reward_zero_std": 0.5, "grad_norm": 1.6400814076191048, "kl": 0.09315968304872513, "learning_rate": 8.016181982380681e-07, "loss": -0.0292, "num_tokens": 28311689.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9858661890029907, "sampling/importance_sampling_ratio/mean": 1.0002015829086304, "sampling/importance_sampling_ratio/min": 0.5013279318809509, "sampling/sampling_logp_difference/max": 0.6904948353767395, "sampling/sampling_logp_difference/mean": 0.016102006658911705, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 113.84375, "completions/mean_terminated_length": 113.84375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.15809348225593567, "epoch": 1.0980392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.09564743092595664, "kl": 0.07759271562099457, "learning_rate": 8.010497253335e-07, "loss": 0.0008, "num_tokens": 28334575.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.578310489654541, "sampling/importance_sampling_ratio/mean": 0.9996614456176758, "sampling/importance_sampling_ratio/min": 0.551412045955658, "sampling/sampling_logp_difference/max": 0.5952730178833008, "sampling/sampling_logp_difference/mean": 0.013403412885963917, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 172.15625, "completions/mean_terminated_length": 172.15625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2104586362838745, "epoch": 1.099264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.08695914435531384, "kl": 0.09293670952320099, "learning_rate": 8.004806413170612e-07, "loss": 0.0008, "num_tokens": 28362585.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001728534698486, "sampling/importance_sampling_ratio/min": 0.4738742709159851, "sampling/sampling_logp_difference/max": 0.7468132972717285, "sampling/sampling_logp_difference/mean": 0.013741472736001015, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.279516339302063, "epoch": 1.1004901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.1899765711186496, "kl": 0.10613664984703064, "learning_rate": 7.999109473439569e-07, "loss": -0.0026, "num_tokens": 28392073.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6462812423706055, "sampling/importance_sampling_ratio/mean": 0.9996769428253174, "sampling/importance_sampling_ratio/min": 0.25184938311576843, "sampling/sampling_logp_difference/max": 1.3789241313934326, "sampling/sampling_logp_difference/mean": 0.016572657972574234, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 146.484375, "completions/mean_terminated_length": 146.484375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.17928224802017212, "epoch": 1.1017156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 1.6791457481155367, "kl": 0.08268716931343079, "learning_rate": 7.993406445706292e-07, "loss": 0.0308, "num_tokens": 28420568.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997105598449707, "sampling/importance_sampling_ratio/min": 0.3454512059688568, "sampling/sampling_logp_difference/max": 1.062903881072998, "sampling/sampling_logp_difference/mean": 0.013035709038376808, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.21753069758415222, "epoch": 1.1029411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.8405227516058766, "kl": 0.061972249299287796, "learning_rate": 7.987697341547568e-07, "loss": 0.0221, "num_tokens": 28443344.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6958839893341064, "sampling/importance_sampling_ratio/mean": 1.0006866455078125, "sampling/importance_sampling_ratio/min": 0.6069773435592651, "sampling/sampling_logp_difference/max": 0.528204083442688, "sampling/sampling_logp_difference/mean": 0.014490251429378986, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 167.203125, "completions/mean_terminated_length": 167.203125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.23172199726104736, "epoch": 1.1041666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 1.084201416307403, "kl": 0.11401450634002686, "learning_rate": 7.981982172552517e-07, "loss": -0.0055, "num_tokens": 28472525.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6166857481002808, "sampling/importance_sampling_ratio/mean": 0.9994915127754211, "sampling/importance_sampling_ratio/min": 0.47824835777282715, "sampling/sampling_logp_difference/max": 0.7376251220703125, "sampling/sampling_logp_difference/mean": 0.015587793663144112, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 142.96875, "completions/mean_terminated_length": 142.96875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.14093346893787384, "epoch": 1.1053921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.0901951837919706, "kl": 0.06298256665468216, "learning_rate": 7.976260950322571e-07, "loss": -0.018, "num_tokens": 28496107.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0012966394424438, "sampling/importance_sampling_ratio/min": 0.28800708055496216, "sampling/sampling_logp_difference/max": 1.2447702884674072, "sampling/sampling_logp_difference/mean": 0.013270031660795212, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.22646483778953552, "epoch": 1.1066176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 1.3118736590667104, "kl": 0.061849504709243774, "learning_rate": 7.970533686471448e-07, "loss": -0.0118, "num_tokens": 28532011.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000143051147461, "sampling/importance_sampling_ratio/min": 0.4926973581314087, "sampling/sampling_logp_difference/max": 0.9254157543182373, "sampling/sampling_logp_difference/mean": 0.014329024590551853, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 173.78125, "completions/mean_terminated_length": 173.78125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.18403930962085724, "epoch": 1.107843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.08850598821881436, "kl": 0.061722591519355774, "learning_rate": 7.964800392625128e-07, "loss": 0.0006, "num_tokens": 28561373.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.527761459350586, "sampling/importance_sampling_ratio/mean": 0.9997848272323608, "sampling/importance_sampling_ratio/min": 0.2004726231098175, "sampling/sampling_logp_difference/max": 1.6070775985717773, "sampling/sampling_logp_difference/mean": 0.013661077246069908, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 166.78125, "completions/mean_terminated_length": 166.78125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.17432111501693726, "epoch": 1.1090686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.07721996234452343, "kl": 0.07478277385234833, "learning_rate": 7.959061080421838e-07, "loss": 0.0007, "num_tokens": 28590415.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009045600891113, "sampling/importance_sampling_ratio/min": 0.5906212329864502, "sampling/sampling_logp_difference/max": 0.9104375839233398, "sampling/sampling_logp_difference/mean": 0.013019047677516937, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 149.546875, "completions/mean_terminated_length": 149.546875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.2405448853969574, "epoch": 1.1102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.11768022147162954, "kl": 0.09011099487543106, "learning_rate": 7.953315761512017e-07, "loss": 0.0008, "num_tokens": 28616482.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6234381198883057, "sampling/importance_sampling_ratio/mean": 0.999937117099762, "sampling/importance_sampling_ratio/min": 0.5206286907196045, "sampling/sampling_logp_difference/max": 0.652718186378479, "sampling/sampling_logp_difference/mean": 0.01653948426246643, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24027252197265625, "epoch": 1.1115196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.048770372300378353, "kl": 0.09848096966743469, "learning_rate": 7.947564447558299e-07, "loss": 0.0009, "num_tokens": 28643450.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5679230690002441, "sampling/importance_sampling_ratio/mean": 0.9994180202484131, "sampling/importance_sampling_ratio/min": 0.3743257522583008, "sampling/sampling_logp_difference/max": 0.9826288223266602, "sampling/sampling_logp_difference/mean": 0.014152075164020061, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 190.59375, "completions/mean_terminated_length": 190.59375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.21170996129512787, "epoch": 1.1127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.057692996553791565, "kl": 0.06437669694423676, "learning_rate": 7.941807150235485e-07, "loss": 0.0006, "num_tokens": 28676832.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.697697401046753, "sampling/importance_sampling_ratio/mean": 0.9997852444648743, "sampling/importance_sampling_ratio/min": 0.5717601180076599, "sampling/sampling_logp_difference/max": 0.5590357780456543, "sampling/sampling_logp_difference/mean": 0.01418859139084816, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 214.28125, "completions/mean_terminated_length": 214.28125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.23234930634498596, "epoch": 1.1139705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.055265118978672645, "kl": 0.08269356191158295, "learning_rate": 7.936043881230525e-07, "loss": 0.0008, "num_tokens": 28709346.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997777938842773, "sampling/importance_sampling_ratio/min": 0.447732537984848, "sampling/sampling_logp_difference/max": 1.0789175033569336, "sampling/sampling_logp_difference/mean": 0.0158439502120018, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3130796253681183, "epoch": 1.1151960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.400752969110404, "kl": 0.11039891093969345, "learning_rate": 7.930274652242491e-07, "loss": 0.0088, "num_tokens": 28736522.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6554890871047974, "sampling/importance_sampling_ratio/mean": 0.9998528361320496, "sampling/importance_sampling_ratio/min": 0.6171384453773499, "sampling/sampling_logp_difference/max": 0.504096508026123, "sampling/sampling_logp_difference/mean": 0.01720406860113144, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 206.96875, "completions/mean_terminated_length": 206.96875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.24904829263687134, "epoch": 1.116421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.666155549267269, "kl": 0.0964755266904831, "learning_rate": 7.924499474982551e-07, "loss": 0.011, "num_tokens": 28774504.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8129268884658813, "sampling/importance_sampling_ratio/mean": 0.9995471239089966, "sampling/importance_sampling_ratio/min": 0.2583095133304596, "sampling/sampling_logp_difference/max": 1.3535966873168945, "sampling/sampling_logp_difference/mean": 0.01603316329419613, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.25998538732528687, "epoch": 1.1176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.328124972758362, "kl": 0.08714132755994797, "learning_rate": 7.91871836117395e-07, "loss": 0.0159, "num_tokens": 28800528.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000813007354736, "sampling/importance_sampling_ratio/min": 0.52607262134552, "sampling/sampling_logp_difference/max": 0.9863278865814209, "sampling/sampling_logp_difference/mean": 0.01717241108417511, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 186.59375, "completions/mean_terminated_length": 186.59375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2034502923488617, "epoch": 1.1188725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.04940559681708589, "kl": 0.07114953547716141, "learning_rate": 7.91293132255198e-07, "loss": 0.0007, "num_tokens": 28833446.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.645533800125122, "sampling/importance_sampling_ratio/mean": 0.9995165467262268, "sampling/importance_sampling_ratio/min": 0.6127004623413086, "sampling/sampling_logp_difference/max": 0.4980647563934326, "sampling/sampling_logp_difference/mean": 0.012644816190004349, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 190.296875, "completions/mean_terminated_length": 190.296875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24596183001995087, "epoch": 1.1200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.5655704185905628, "kl": 0.06891372799873352, "learning_rate": 7.907138370863967e-07, "loss": -0.0189, "num_tokens": 28863177.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6278371810913086, "sampling/importance_sampling_ratio/mean": 0.9996863007545471, "sampling/importance_sampling_ratio/min": 0.4309251606464386, "sampling/sampling_logp_difference/max": 0.8418208360671997, "sampling/sampling_logp_difference/mean": 0.014467434026300907, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 213.203125, "completions/mean_terminated_length": 213.203125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.1680026650428772, "epoch": 1.1213235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.107844507143739, "kl": 0.062312401831150055, "learning_rate": 7.901339517869232e-07, "loss": -0.0027, "num_tokens": 28897062.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 0.9992839694023132, "sampling/importance_sampling_ratio/min": 0.1829947978258133, "sampling/sampling_logp_difference/max": 1.6982975006103516, "sampling/sampling_logp_difference/mean": 0.01189229916781187, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 124.671875, "completions/mean_terminated_length": 124.671875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.20145872235298157, "epoch": 1.1225490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0697433020845855, "kl": 0.07393026351928711, "learning_rate": 7.895534775339083e-07, "loss": 0.0008, "num_tokens": 28925697.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6738722324371338, "sampling/importance_sampling_ratio/mean": 0.9997876286506653, "sampling/importance_sampling_ratio/min": 0.42287662625312805, "sampling/sampling_logp_difference/max": 0.8606748580932617, "sampling/sampling_logp_difference/mean": 0.014831296168267727, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 188.28125, "completions/mean_terminated_length": 188.28125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2790355980396271, "epoch": 1.1237745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.6326954238160873, "kl": 0.09928452968597412, "learning_rate": 7.889724155056776e-07, "loss": -0.0389, "num_tokens": 28965555.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000627040863037, "sampling/importance_sampling_ratio/min": 0.29445353150367737, "sampling/sampling_logp_difference/max": 1.2226340770721436, "sampling/sampling_logp_difference/mean": 0.017847351729869843, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 236.8125, "completions/mean_terminated_length": 236.8125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2594095766544342, "epoch": 1.125, "frac_reward_zero_std": 0.75, "grad_norm": 1.361891349827105, "kl": 0.090843565762043, "learning_rate": 7.883907668817506e-07, "loss": -0.004, "num_tokens": 29000967.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6158757209777832, "sampling/importance_sampling_ratio/mean": 1.0001858472824097, "sampling/importance_sampling_ratio/min": 0.5726673603057861, "sampling/sampling_logp_difference/max": 0.5574502944946289, "sampling/sampling_logp_difference/mean": 0.01475977711379528, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 263.03125, "completions/mean_terminated_length": 263.03125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.27755606174468994, "epoch": 1.1262254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.5527147258963039, "kl": 0.10148150473833084, "learning_rate": 7.878085328428368e-07, "loss": -0.0231, "num_tokens": 29034137.0, "reward": 0.21875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000124216079712, "sampling/importance_sampling_ratio/min": 0.4954375624656677, "sampling/sampling_logp_difference/max": 0.7390586137771606, "sampling/sampling_logp_difference/mean": 0.014863025397062302, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 229.78125, "completions/mean_terminated_length": 229.78125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2748275399208069, "epoch": 1.1274509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 1.8659229034113305, "kl": 0.06278257071971893, "learning_rate": 7.872257145708345e-07, "loss": 0.0131, "num_tokens": 29070443.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5545119047164917, "sampling/importance_sampling_ratio/mean": 0.9995891451835632, "sampling/importance_sampling_ratio/min": 0.3962823748588562, "sampling/sampling_logp_difference/max": 0.9256283044815063, "sampling/sampling_logp_difference/mean": 0.016074594110250473, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 249.0625, "completions/mean_terminated_length": 249.0625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.1775428056716919, "epoch": 1.1286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.2413018919858242, "kl": 0.06724140048027039, "learning_rate": 7.86642313248828e-07, "loss": 0.0006, "num_tokens": 29102079.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8754522800445557, "sampling/importance_sampling_ratio/mean": 0.9997156262397766, "sampling/importance_sampling_ratio/min": 0.3408900499343872, "sampling/sampling_logp_difference/max": 1.076195240020752, "sampling/sampling_logp_difference/mean": 0.012227769941091537, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3411758840084076, "epoch": 1.1299019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.6798456484998654, "kl": 0.10103578865528107, "learning_rate": 7.860583300610847e-07, "loss": -0.0457, "num_tokens": 29143757.0, "reward": 0.34375, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6482800245285034, "sampling/importance_sampling_ratio/mean": 1.0002042055130005, "sampling/importance_sampling_ratio/min": 0.6066067218780518, "sampling/sampling_logp_difference/max": 0.4998745918273926, "sampling/sampling_logp_difference/mean": 0.017368076369166374, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2883768081665039, "epoch": 1.1311274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.04460912249120463, "kl": 0.06478792428970337, "learning_rate": 7.854737661930539e-07, "loss": 0.0006, "num_tokens": 29173413.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7809442281723022, "sampling/importance_sampling_ratio/mean": 0.9995173811912537, "sampling/importance_sampling_ratio/min": 0.2171446681022644, "sampling/sampling_logp_difference/max": 1.5271915197372437, "sampling/sampling_logp_difference/mean": 0.015471132472157478, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 242.90625, "completions/mean_terminated_length": 242.90625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2667590379714966, "epoch": 1.1323529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.4411946643889548, "kl": 0.09007437527179718, "learning_rate": 7.848886228313632e-07, "loss": 0.0905, "num_tokens": 29208687.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6544312238693237, "sampling/importance_sampling_ratio/mean": 0.9998416304588318, "sampling/importance_sampling_ratio/min": 0.5485104322433472, "sampling/sampling_logp_difference/max": 0.6005489826202393, "sampling/sampling_logp_difference/mean": 0.014343110844492912, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 172.328125, "completions/mean_terminated_length": 172.328125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.18077805638313293, "epoch": 1.133578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0403236538211983, "kl": 0.04739869385957718, "learning_rate": 7.843029011638162e-07, "loss": 0.0005, "num_tokens": 29232884.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002453327178955, "sampling/importance_sampling_ratio/min": 0.48107123374938965, "sampling/sampling_logp_difference/max": 0.746837854385376, "sampling/sampling_logp_difference/mean": 0.011145679280161858, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 227.3125, "completions/mean_terminated_length": 227.3125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.22841443121433258, "epoch": 1.1348039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.9636693012874294, "kl": 0.07613380253314972, "learning_rate": 7.837166023793908e-07, "loss": -0.0178, "num_tokens": 29266520.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.75249445438385, "sampling/importance_sampling_ratio/mean": 1.000523328781128, "sampling/importance_sampling_ratio/min": 0.5525768399238586, "sampling/sampling_logp_difference/max": 0.5931627750396729, "sampling/sampling_logp_difference/mean": 0.012801182456314564, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 155.28125, "completions/mean_terminated_length": 155.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.23076486587524414, "epoch": 1.1360294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.8722361970346875, "kl": 0.10799363255500793, "learning_rate": 7.831297276682368e-07, "loss": 0.033, "num_tokens": 29290570.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.8826043605804443, "sampling/importance_sampling_ratio/mean": 1.0002775192260742, "sampling/importance_sampling_ratio/min": 0.4824259579181671, "sampling/sampling_logp_difference/max": 0.7289278507232666, "sampling/sampling_logp_difference/mean": 0.01528744027018547, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 170.953125, "completions/mean_terminated_length": 170.953125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2521914541721344, "epoch": 1.1372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.05808780249728172, "kl": 0.10633736848831177, "learning_rate": 7.825422782216724e-07, "loss": 0.0011, "num_tokens": 29321895.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4849331378936768, "sampling/importance_sampling_ratio/mean": 0.9999880194664001, "sampling/importance_sampling_ratio/min": 0.6073311567306519, "sampling/sampling_logp_difference/max": 0.49868106842041016, "sampling/sampling_logp_difference/mean": 0.014466575346887112, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 190.828125, "completions/mean_terminated_length": 190.828125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23662221431732178, "epoch": 1.1384803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.6294933536697809, "kl": 0.05445215106010437, "learning_rate": 7.819542552321827e-07, "loss": 0.0043, "num_tokens": 29349644.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4760836362838745, "sampling/importance_sampling_ratio/mean": 0.9997459650039673, "sampling/importance_sampling_ratio/min": 0.5910096168518066, "sampling/sampling_logp_difference/max": 0.525922954082489, "sampling/sampling_logp_difference/mean": 0.013352379202842712, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 202.984375, "completions/mean_terminated_length": 202.984375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.27404844760894775, "epoch": 1.1397058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.2645598118345176, "kl": 0.07706104218959808, "learning_rate": 7.813656598934173e-07, "loss": -0.0056, "num_tokens": 29379915.0, "reward": -0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998271465301514, "sampling/importance_sampling_ratio/min": 0.5051249265670776, "sampling/sampling_logp_difference/max": 0.7048373222351074, "sampling/sampling_logp_difference/mean": 0.0161677747964859, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 269.28125, "completions/mean_terminated_length": 269.28125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.26741868257522583, "epoch": 1.1409313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.9393949610844635, "kl": 0.07544878125190735, "learning_rate": 7.807764934001874e-07, "loss": -0.0045, "num_tokens": 29413757.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6231114864349365, "sampling/importance_sampling_ratio/mean": 0.9996011257171631, "sampling/importance_sampling_ratio/min": 0.23029865324497223, "sampling/sampling_logp_difference/max": 1.4683783054351807, "sampling/sampling_logp_difference/mean": 0.015428060665726662, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 197.71875, "completions/mean_terminated_length": 197.71875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.24409207701683044, "epoch": 1.142156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.07716562395080737, "kl": 0.09720595180988312, "learning_rate": 7.801867569484634e-07, "loss": 0.0009, "num_tokens": 29448891.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.964284062385559, "sampling/importance_sampling_ratio/mean": 0.9992693662643433, "sampling/importance_sampling_ratio/min": 0.3163638114929199, "sampling/sampling_logp_difference/max": 1.150862455368042, "sampling/sampling_logp_difference/mean": 0.01469617709517479, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26554036140441895, "epoch": 1.1433823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 1.5312344576887653, "kl": 0.09911169111728668, "learning_rate": 7.795964517353733e-07, "loss": 0.0153, "num_tokens": 29475595.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6276973485946655, "sampling/importance_sampling_ratio/mean": 0.9995471239089966, "sampling/importance_sampling_ratio/min": 0.4482499063014984, "sampling/sampling_logp_difference/max": 0.8024044036865234, "sampling/sampling_logp_difference/mean": 0.014082803390920162, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 170.296875, "completions/mean_terminated_length": 170.296875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.23128408193588257, "epoch": 1.1446078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.611400382252435, "kl": 0.09448525309562683, "learning_rate": 7.790055789591993e-07, "loss": 0.0169, "num_tokens": 29502830.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6554914712905884, "sampling/importance_sampling_ratio/mean": 1.000464677810669, "sampling/importance_sampling_ratio/min": 0.5213980674743652, "sampling/sampling_logp_difference/max": 0.6512415409088135, "sampling/sampling_logp_difference/mean": 0.014646067284047604, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 164.640625, "completions/mean_terminated_length": 164.640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.19989417493343353, "epoch": 1.1458333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.043187364235219225, "kl": 0.06290332973003387, "learning_rate": 7.784141398193753e-07, "loss": 0.0006, "num_tokens": 29537687.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6942651271820068, "sampling/importance_sampling_ratio/mean": 0.9995973706245422, "sampling/importance_sampling_ratio/min": 0.6011142730712891, "sampling/sampling_logp_difference/max": 0.5272490978240967, "sampling/sampling_logp_difference/mean": 0.012707795947790146, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 238.640625, "completions/mean_terminated_length": 238.640625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.270319402217865, "epoch": 1.1470588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 2.0861291354862272, "kl": 0.11952750384807587, "learning_rate": 7.778221355164857e-07, "loss": -0.0327, "num_tokens": 29579984.0, "reward": 0.21875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.868499755859375, "sampling/importance_sampling_ratio/mean": 1.0005584955215454, "sampling/importance_sampling_ratio/min": 0.48394113779067993, "sampling/sampling_logp_difference/max": 0.7257919311523438, "sampling/sampling_logp_difference/mean": 0.015709731727838516, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 196.109375, "completions/mean_terminated_length": 196.109375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.21992076933383942, "epoch": 1.1482843137254901, "frac_reward_zero_std": 0.5, "grad_norm": 1.5846443807671733, "kl": 0.0719984918832779, "learning_rate": 7.772295672522614e-07, "loss": -0.0043, "num_tokens": 29611431.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.987679362297058, "sampling/importance_sampling_ratio/mean": 0.9999401569366455, "sampling/importance_sampling_ratio/min": 0.26966604590415955, "sampling/sampling_logp_difference/max": 1.3105709552764893, "sampling/sampling_logp_difference/mean": 0.013067006133496761, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 159.34375, "completions/mean_terminated_length": 159.34375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2844783365726471, "epoch": 1.1495098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 2.106287045051587, "kl": 0.14609958231449127, "learning_rate": 7.766364362295788e-07, "loss": -0.0294, "num_tokens": 29640813.0, "reward": -0.21875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6640719175338745, "sampling/importance_sampling_ratio/mean": 0.9998704791069031, "sampling/importance_sampling_ratio/min": 0.54460608959198, "sampling/sampling_logp_difference/max": 0.6076924800872803, "sampling/sampling_logp_difference/mean": 0.017406413331627846, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 184.140625, "completions/mean_terminated_length": 184.140625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3216135799884796, "epoch": 1.150735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.7505349015734255, "kl": 0.14020898938179016, "learning_rate": 7.760427436524559e-07, "loss": -0.0575, "num_tokens": 29671094.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6192717552185059, "sampling/importance_sampling_ratio/mean": 1.0005667209625244, "sampling/importance_sampling_ratio/min": 0.6146707534790039, "sampling/sampling_logp_difference/max": 0.48666858673095703, "sampling/sampling_logp_difference/mean": 0.01606844738125801, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 183.328125, "completions/mean_terminated_length": 183.328125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2882741689682007, "epoch": 1.1519607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.2742192952923719, "kl": 0.11009672284126282, "learning_rate": 7.754484907260512e-07, "loss": -0.0043, "num_tokens": 29700539.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9129751920700073, "sampling/importance_sampling_ratio/mean": 0.9998451471328735, "sampling/importance_sampling_ratio/min": 0.5417364239692688, "sampling/sampling_logp_difference/max": 0.6486597061157227, "sampling/sampling_logp_difference/mean": 0.016127295792102814, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 235.953125, "completions/mean_terminated_length": 235.953125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2792004346847534, "epoch": 1.153186274509804, "frac_reward_zero_std": 0.25, "grad_norm": 1.7810826416466288, "kl": 0.09145475178956985, "learning_rate": 7.748536786566606e-07, "loss": 0.0448, "num_tokens": 29734712.0, "reward": 0.46875, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8409394025802612, "sampling/importance_sampling_ratio/mean": 0.9997409582138062, "sampling/importance_sampling_ratio/min": 0.32505807280540466, "sampling/sampling_logp_difference/max": 1.1237514019012451, "sampling/sampling_logp_difference/mean": 0.014812508597970009, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 165.6875, "completions/mean_terminated_length": 165.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.31966108083724976, "epoch": 1.1544117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 2.281936858636925, "kl": 0.1147167757153511, "learning_rate": 7.742583086517149e-07, "loss": 0.0217, "num_tokens": 29767604.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5733104944229126, "sampling/importance_sampling_ratio/mean": 1.0004137754440308, "sampling/importance_sampling_ratio/min": 0.5986762642860413, "sampling/sampling_logp_difference/max": 0.5130343437194824, "sampling/sampling_logp_difference/mean": 0.015881778672337532, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 239.015625, "completions/mean_terminated_length": 239.015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24411898851394653, "epoch": 1.155637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.1515480831371738, "kl": 0.06654952466487885, "learning_rate": 7.736623819197773e-07, "loss": 0.0136, "num_tokens": 29800933.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0002235174179077, "sampling/importance_sampling_ratio/min": 0.49754151701927185, "sampling/sampling_logp_difference/max": 0.6980762481689453, "sampling/sampling_logp_difference/mean": 0.014166567474603653, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 193.046875, "completions/mean_terminated_length": 193.046875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2028464674949646, "epoch": 1.156862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.15238905596366073, "kl": 0.07373914867639542, "learning_rate": 7.730658996705415e-07, "loss": 0.0007, "num_tokens": 29832808.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999911785125732, "sampling/importance_sampling_ratio/min": 0.4362257421016693, "sampling/sampling_logp_difference/max": 1.1146588325500488, "sampling/sampling_logp_difference/mean": 0.013756453059613705, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 147.140625, "completions/mean_terminated_length": 147.140625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24343600869178772, "epoch": 1.1580882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640526979620608, "kl": 0.12458021193742752, "learning_rate": 7.724688631148286e-07, "loss": 0.0012, "num_tokens": 29860337.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 0.9997891187667847, "sampling/importance_sampling_ratio/min": 0.48723921179771423, "sampling/sampling_logp_difference/max": 0.7190001010894775, "sampling/sampling_logp_difference/mean": 0.0150705361738801, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24111589789390564, "epoch": 1.159313725490196, "frac_reward_zero_std": 0.25, "grad_norm": 2.088480544947843, "kl": 0.09656170010566711, "learning_rate": 7.718712734645849e-07, "loss": 0.0145, "num_tokens": 29891225.0, "reward": 0.5625, "reward_std": 0.622555673122406, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9504865407943726, "sampling/importance_sampling_ratio/mean": 1.0003337860107422, "sampling/importance_sampling_ratio/min": 0.4205363392829895, "sampling/sampling_logp_difference/max": 0.8662244081497192, "sampling/sampling_logp_difference/mean": 0.014271966181695461, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 164.921875, "completions/mean_terminated_length": 164.921875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.23615966737270355, "epoch": 1.1605392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 2.360246726436841, "kl": 0.11994391679763794, "learning_rate": 7.712731319328797e-07, "loss": 0.0086, "num_tokens": 29919316.0, "reward": -0.1875, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.9379637241363525, "sampling/importance_sampling_ratio/mean": 0.9998552799224854, "sampling/importance_sampling_ratio/min": 0.2048644721508026, "sampling/sampling_logp_difference/max": 1.5854065418243408, "sampling/sampling_logp_difference/mean": 0.014262652024626732, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 135.046875, "completions/mean_terminated_length": 135.046875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.1943216323852539, "epoch": 1.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0982288019009852, "kl": 0.09366685152053833, "learning_rate": 7.706744397339022e-07, "loss": 0.0009, "num_tokens": 29943847.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5468405485153198, "sampling/importance_sampling_ratio/mean": 0.9990905523300171, "sampling/importance_sampling_ratio/min": 0.3681606352329254, "sampling/sampling_logp_difference/max": 0.999235987663269, "sampling/sampling_logp_difference/mean": 0.013385389931499958, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2914361357688904, "epoch": 1.1629901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.592449862986648, "kl": 0.1040593683719635, "learning_rate": 7.700751980829601e-07, "loss": 0.0265, "num_tokens": 29974231.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 1.0004979372024536, "sampling/importance_sampling_ratio/min": 0.6368659138679504, "sampling/sampling_logp_difference/max": 0.4985384941101074, "sampling/sampling_logp_difference/mean": 0.015406950376927853, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 143.359375, "completions/mean_terminated_length": 143.359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.21524740755558014, "epoch": 1.1642156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 1.404595430725208, "kl": 0.09196782857179642, "learning_rate": 7.694754081964754e-07, "loss": 0.0017, "num_tokens": 29998974.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.511399269104004, "sampling/importance_sampling_ratio/mean": 0.9996278285980225, "sampling/importance_sampling_ratio/min": 0.048157621175050735, "sampling/sampling_logp_difference/max": 3.033275842666626, "sampling/sampling_logp_difference/mean": 0.014019029214978218, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 201.84375, "completions/mean_terminated_length": 201.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22843119502067566, "epoch": 1.1654411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 2.62753929659269, "kl": 0.08088122308254242, "learning_rate": 7.688750712919839e-07, "loss": 0.1308, "num_tokens": 30033316.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9675623178482056, "sampling/importance_sampling_ratio/mean": 0.9993234872817993, "sampling/importance_sampling_ratio/min": 0.5146050453186035, "sampling/sampling_logp_difference/max": 0.6767954230308533, "sampling/sampling_logp_difference/mean": 0.013346588239073753, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 220.203125, "completions/mean_terminated_length": 220.203125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2933310270309448, "epoch": 1.1666666666666667, "frac_reward_zero_std": 0.25, "grad_norm": 1.8790975865395463, "kl": 0.11788161098957062, "learning_rate": 7.682741885881314e-07, "loss": -0.0952, "num_tokens": 30065745.0, "reward": 0.40625, "reward_std": 0.747555673122406, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7792528867721558, "sampling/importance_sampling_ratio/mean": 1.0003098249435425, "sampling/importance_sampling_ratio/min": 0.10682544112205505, "sampling/sampling_logp_difference/max": 2.2365591526031494, "sampling/sampling_logp_difference/mean": 0.016651522368192673, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.29392799735069275, "epoch": 1.1678921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.3190970582439598, "kl": 0.10862173140048981, "learning_rate": 7.676727613046719e-07, "loss": 0.0063, "num_tokens": 30101649.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9622893333435059, "sampling/importance_sampling_ratio/mean": 0.9999975562095642, "sampling/importance_sampling_ratio/min": 0.3799148201942444, "sampling/sampling_logp_difference/max": 0.9678082466125488, "sampling/sampling_logp_difference/mean": 0.018010687083005905, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 157.4375, "completions/mean_terminated_length": 157.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.23681524395942688, "epoch": 1.1691176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 1.2859018028114504, "kl": 0.11465495824813843, "learning_rate": 7.670707906624643e-07, "loss": 0.0129, "num_tokens": 30126461.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994912147521973, "sampling/importance_sampling_ratio/min": 0.3715115785598755, "sampling/sampling_logp_difference/max": 0.9901752471923828, "sampling/sampling_logp_difference/mean": 0.015134238637983799, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.25739586353302, "epoch": 1.170343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.9051026750783888, "kl": 0.10601422935724258, "learning_rate": 7.664682778834712e-07, "loss": 0.0091, "num_tokens": 30154149.0, "reward": 0.0, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9012205600738525, "sampling/importance_sampling_ratio/mean": 1.0000576972961426, "sampling/importance_sampling_ratio/min": 0.6222960948944092, "sampling/sampling_logp_difference/max": 0.6424961090087891, "sampling/sampling_logp_difference/mean": 0.014406044036149979, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 171.59375, "completions/mean_terminated_length": 171.59375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.26630714535713196, "epoch": 1.1715686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.2159802984754928, "kl": 0.11691936105489731, "learning_rate": 7.658652241907554e-07, "loss": -0.0028, "num_tokens": 30179435.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5972579717636108, "sampling/importance_sampling_ratio/mean": 1.0007264614105225, "sampling/importance_sampling_ratio/min": 0.5100693702697754, "sampling/sampling_logp_difference/max": 0.6732085347175598, "sampling/sampling_logp_difference/mean": 0.01576792448759079, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 181.203125, "completions/mean_terminated_length": 181.203125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2374798059463501, "epoch": 1.1727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.9398123426729155, "kl": 0.12092936784029007, "learning_rate": 7.652616308084774e-07, "loss": -0.0298, "num_tokens": 30210264.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.9997060298919678, "sampling/importance_sampling_ratio/min": 0.6082713603973389, "sampling/sampling_logp_difference/max": 0.49713414907455444, "sampling/sampling_logp_difference/mean": 0.012684464454650879, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 157.703125, "completions/mean_terminated_length": 157.703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.19216549396514893, "epoch": 1.1740196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.1590387512992275, "kl": 0.11101230978965759, "learning_rate": 7.646574989618937e-07, "loss": 0.011, "num_tokens": 30236373.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7858788967132568, "sampling/importance_sampling_ratio/mean": 1.0005167722702026, "sampling/importance_sampling_ratio/min": 0.5496814846992493, "sampling/sampling_logp_difference/max": 0.5984163284301758, "sampling/sampling_logp_difference/mean": 0.012150926515460014, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 169.328125, "completions/mean_terminated_length": 169.328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.22564482688903809, "epoch": 1.1752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.209704009097848, "kl": 0.09982454031705856, "learning_rate": 7.640528298773536e-07, "loss": -0.0015, "num_tokens": 30263402.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6621389389038086, "sampling/importance_sampling_ratio/mean": 1.0002570152282715, "sampling/importance_sampling_ratio/min": 0.41285261511802673, "sampling/sampling_logp_difference/max": 0.8846646547317505, "sampling/sampling_logp_difference/mean": 0.01376567967236042, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 182.34375, "completions/mean_terminated_length": 182.34375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24452808499336243, "epoch": 1.1764705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.1652783005328748, "kl": 0.12087363749742508, "learning_rate": 7.634476247822972e-07, "loss": -0.0011, "num_tokens": 30291136.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.820923924446106, "sampling/importance_sampling_ratio/mean": 0.9999774694442749, "sampling/importance_sampling_ratio/min": 0.4822184443473816, "sampling/sampling_logp_difference/max": 0.7293580770492554, "sampling/sampling_logp_difference/mean": 0.014854757115244865, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.23493146896362305, "epoch": 1.1776960784313726, "frac_reward_zero_std": 0.25, "grad_norm": 1.715701720905447, "kl": 0.11910027265548706, "learning_rate": 7.628418849052523e-07, "loss": -0.0006, "num_tokens": 30317792.0, "reward": 0.75, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7189116477966309, "sampling/importance_sampling_ratio/mean": 1.0002059936523438, "sampling/importance_sampling_ratio/min": 0.6273089051246643, "sampling/sampling_logp_difference/max": 0.5416913032531738, "sampling/sampling_logp_difference/mean": 0.013392720371484756, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.23814696073532104, "epoch": 1.178921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 2.1236190687946874, "kl": 0.10747385770082474, "learning_rate": 7.622356114758327e-07, "loss": -0.0233, "num_tokens": 30344984.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001795291900635, "sampling/importance_sampling_ratio/min": 0.03683508187532425, "sampling/sampling_logp_difference/max": 3.301304578781128, "sampling/sampling_logp_difference/mean": 0.014577718451619148, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 173.796875, "completions/mean_terminated_length": 173.796875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.25150713324546814, "epoch": 1.1801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.207080415526954, "kl": 0.10294643044471741, "learning_rate": 7.616288057247349e-07, "loss": -0.0088, "num_tokens": 30375211.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6083003282546997, "sampling/importance_sampling_ratio/mean": 0.9993696212768555, "sampling/importance_sampling_ratio/min": 0.1920984983444214, "sampling/sampling_logp_difference/max": 1.6497470140457153, "sampling/sampling_logp_difference/mean": 0.014811830595135689, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 130.15625, "completions/mean_terminated_length": 130.15625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.27737027406692505, "epoch": 1.1813725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.060223855735233534, "kl": 0.11574649065732956, "learning_rate": 7.610214688837361e-07, "loss": 0.0011, "num_tokens": 30410293.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4464823007583618, "sampling/importance_sampling_ratio/mean": 0.9997907280921936, "sampling/importance_sampling_ratio/min": 0.6293777227401733, "sampling/sampling_logp_difference/max": 0.46302366256713867, "sampling/sampling_logp_difference/mean": 0.014945710077881813, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 147.046875, "completions/mean_terminated_length": 147.046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.1855459064245224, "epoch": 1.1825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.048182256152113835, "kl": 0.08147725462913513, "learning_rate": 7.604136021856916e-07, "loss": 0.0008, "num_tokens": 30436168.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007353067398071, "sampling/importance_sampling_ratio/mean": 1.0000842809677124, "sampling/importance_sampling_ratio/min": 0.5712746977806091, "sampling/sampling_logp_difference/max": 0.5598850250244141, "sampling/sampling_logp_difference/mean": 0.012940846383571625, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 131.890625, "completions/mean_terminated_length": 131.890625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.23324932157993317, "epoch": 1.1838235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.4133326252559428, "kl": 0.10719141364097595, "learning_rate": 7.598052068645324e-07, "loss": -0.0014, "num_tokens": 30466737.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6355787515640259, "sampling/importance_sampling_ratio/mean": 1.0004844665527344, "sampling/importance_sampling_ratio/min": 0.614501953125, "sampling/sampling_logp_difference/max": 0.49199676513671875, "sampling/sampling_logp_difference/mean": 0.014336496591567993, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 155.859375, "completions/mean_terminated_length": 155.859375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.21427032351493835, "epoch": 1.1850490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.3469113174737197, "kl": 0.0982939600944519, "learning_rate": 7.591962841552626e-07, "loss": 0.0031, "num_tokens": 30502136.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.563099980354309, "sampling/importance_sampling_ratio/mean": 1.0004643201828003, "sampling/importance_sampling_ratio/min": 0.43885937333106995, "sampling/sampling_logp_difference/max": 0.8235762119293213, "sampling/sampling_logp_difference/mean": 0.014352173544466496, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 181.484375, "completions/mean_terminated_length": 181.484375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24575388431549072, "epoch": 1.1862745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.1527970986647007, "kl": 0.10954149812459946, "learning_rate": 7.585868352939562e-07, "loss": 0.0126, "num_tokens": 30530631.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7687509059906006, "sampling/importance_sampling_ratio/mean": 1.0001543760299683, "sampling/importance_sampling_ratio/min": 0.6259166598320007, "sampling/sampling_logp_difference/max": 0.5702736377716064, "sampling/sampling_logp_difference/mean": 0.013726416043937206, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 159.578125, "completions/mean_terminated_length": 159.578125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.32892221212387085, "epoch": 1.1875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0645469121857873, "kl": 0.10489454120397568, "learning_rate": 7.579768615177564e-07, "loss": 0.001, "num_tokens": 30557644.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.948359489440918, "sampling/importance_sampling_ratio/mean": 0.9999338388442993, "sampling/importance_sampling_ratio/min": 0.6127480864524841, "sampling/sampling_logp_difference/max": 0.6669877767562866, "sampling/sampling_logp_difference/mean": 0.01787525787949562, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 258.515625, "completions/mean_terminated_length": 258.515625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.31929582357406616, "epoch": 1.1887254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.7019363401511123, "kl": 0.10821347683668137, "learning_rate": 7.57366364064871e-07, "loss": 0.0424, "num_tokens": 30594285.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5922117233276367, "sampling/importance_sampling_ratio/mean": 1.0006930828094482, "sampling/importance_sampling_ratio/min": 0.5471285581588745, "sampling/sampling_logp_difference/max": 0.6030714511871338, "sampling/sampling_logp_difference/mean": 0.015439395792782307, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 181.5625, "completions/mean_terminated_length": 181.5625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2947167158126831, "epoch": 1.1899509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 1.3912095770517858, "kl": 0.13718779385089874, "learning_rate": 7.567553441745711e-07, "loss": -0.0096, "num_tokens": 30629057.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5678318738937378, "sampling/importance_sampling_ratio/mean": 0.9998456239700317, "sampling/importance_sampling_ratio/min": 0.5356789231300354, "sampling/sampling_logp_difference/max": 0.6242203712463379, "sampling/sampling_logp_difference/mean": 0.014373021200299263, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 180.140625, "completions/mean_terminated_length": 180.140625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.30969664454460144, "epoch": 1.1911764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 2.7556685828599283, "kl": 0.11412659287452698, "learning_rate": 7.561438030871885e-07, "loss": 0.0002, "num_tokens": 30657306.0, "reward": 0.65625, "reward_std": 0.7015564441680908, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4793007373809814, "sampling/importance_sampling_ratio/mean": 0.9995293617248535, "sampling/importance_sampling_ratio/min": 0.5742098093032837, "sampling/sampling_logp_difference/max": 0.5547604560852051, "sampling/sampling_logp_difference/mean": 0.015855111181735992, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 218.890625, "completions/mean_terminated_length": 218.890625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.33018720149993896, "epoch": 1.1924019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.06753584191478308, "kl": 0.12269055843353271, "learning_rate": 7.555317420441129e-07, "loss": 0.0012, "num_tokens": 30691955.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997884035110474, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 1.0370073318481445, "sampling/sampling_logp_difference/mean": 0.01559330802410841, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 206.53125, "completions/mean_terminated_length": 206.53125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2860816717147827, "epoch": 1.1936274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.058299976937103, "kl": 0.0953373983502388, "learning_rate": 7.549191622877892e-07, "loss": 0.0152, "num_tokens": 30723781.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.556614875793457, "sampling/importance_sampling_ratio/mean": 0.9998517036437988, "sampling/importance_sampling_ratio/min": 0.6611390113830566, "sampling/sampling_logp_difference/max": 0.44251346588134766, "sampling/sampling_logp_difference/mean": 0.014137894846498966, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 252.828125, "completions/mean_terminated_length": 252.828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.32496657967567444, "epoch": 1.1948529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 1.551164058632505, "kl": 0.10039462894201279, "learning_rate": 7.543060650617158e-07, "loss": 0.0045, "num_tokens": 30758474.0, "reward": 0.0, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6483101844787598, "sampling/importance_sampling_ratio/mean": 0.999916672706604, "sampling/importance_sampling_ratio/min": 0.4820709228515625, "sampling/sampling_logp_difference/max": 0.7296640872955322, "sampling/sampling_logp_difference/mean": 0.01448042131960392, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 181.09375, "completions/mean_terminated_length": 181.09375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.33883771300315857, "epoch": 1.196078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.0672261850240166, "kl": 0.1487603634595871, "learning_rate": 7.53692451610441e-07, "loss": 0.0177, "num_tokens": 30789824.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5100367069244385, "sampling/importance_sampling_ratio/mean": 1.0001670122146606, "sampling/importance_sampling_ratio/min": 0.37719595432281494, "sampling/sampling_logp_difference/max": 0.9749904870986938, "sampling/sampling_logp_difference/mean": 0.016527269035577774, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 224.953125, "completions/mean_terminated_length": 224.953125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.30383163690567017, "epoch": 1.1973039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.43357950972283, "kl": 0.10215874016284943, "learning_rate": 7.530783231795614e-07, "loss": -0.065, "num_tokens": 30821213.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.785274863243103, "sampling/importance_sampling_ratio/mean": 0.9999973177909851, "sampling/importance_sampling_ratio/min": 0.6181599497795105, "sampling/sampling_logp_difference/max": 0.579572319984436, "sampling/sampling_logp_difference/mean": 0.015095638111233711, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 212.84375, "completions/mean_terminated_length": 212.84375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2441040575504303, "epoch": 1.1985294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.3686507933610939, "kl": 0.08347011357545853, "learning_rate": 7.524636810157188e-07, "loss": -0.0328, "num_tokens": 30853107.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8847352266311646, "sampling/importance_sampling_ratio/mean": 0.999988317489624, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.6337873935699463, "sampling/sampling_logp_difference/mean": 0.013035210780799389, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 176.046875, "completions/mean_terminated_length": 176.046875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2641918659210205, "epoch": 1.1997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.3487198731382968, "kl": 0.0982346385717392, "learning_rate": 7.518485263665977e-07, "loss": -0.0045, "num_tokens": 30881318.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992034435272217, "sampling/importance_sampling_ratio/min": 0.49549606442451477, "sampling/sampling_logp_difference/max": 0.7284934520721436, "sampling/sampling_logp_difference/mean": 0.017063260078430176, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 228.484375, "completions/mean_terminated_length": 228.484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.32671964168548584, "epoch": 1.2009803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.9774460006360761, "kl": 0.11252576112747192, "learning_rate": 7.512328604809232e-07, "loss": 0.0087, "num_tokens": 30910549.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.558348298072815, "sampling/importance_sampling_ratio/mean": 0.9993785619735718, "sampling/importance_sampling_ratio/min": 0.4324365556240082, "sampling/sampling_logp_difference/max": 0.8383196592330933, "sampling/sampling_logp_difference/mean": 0.01594432070851326, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 235.65625, "completions/mean_terminated_length": 235.65625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.300775945186615, "epoch": 1.2022058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.6703505763764348, "kl": 0.09750582277774811, "learning_rate": 7.506166846084579e-07, "loss": -0.0365, "num_tokens": 30943855.0, "reward": -0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.649430513381958, "sampling/importance_sampling_ratio/mean": 1.000500202178955, "sampling/importance_sampling_ratio/min": 0.5074718594551086, "sampling/sampling_logp_difference/max": 0.6783139705657959, "sampling/sampling_logp_difference/mean": 0.015568524599075317, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 275.171875, "completions/mean_terminated_length": 275.171875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3914821743965149, "epoch": 1.2034313725490196, "frac_reward_zero_std": 0.25, "grad_norm": 1.9709409411135073, "kl": 0.09965787827968597, "learning_rate": 7.5e-07, "loss": -0.0209, "num_tokens": 30991098.0, "reward": 0.65625, "reward_std": 0.6223389506340027, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5761357545852661, "sampling/importance_sampling_ratio/mean": 1.000213861465454, "sampling/importance_sampling_ratio/min": 0.40983694791793823, "sampling/sampling_logp_difference/max": 0.891995906829834, "sampling/sampling_logp_difference/mean": 0.01726599782705307, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.19782614707946777, "epoch": 1.204656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.73829266014413, "kl": 0.1357603371143341, "learning_rate": 7.493828079073801e-07, "loss": 0.0341, "num_tokens": 31015618.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5277687311172485, "sampling/importance_sampling_ratio/mean": 0.9999630451202393, "sampling/importance_sampling_ratio/min": 0.4468963146209717, "sampling/sampling_logp_difference/max": 0.8054287433624268, "sampling/sampling_logp_difference/mean": 0.014209914021193981, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 168.84375, "completions/mean_terminated_length": 168.84375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.22640889883041382, "epoch": 1.2058823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 1.1397833334244651, "kl": 0.10726204514503479, "learning_rate": 7.487651095834588e-07, "loss": 0.0156, "num_tokens": 31040888.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.443230152130127, "sampling/importance_sampling_ratio/mean": 0.9990359544754028, "sampling/importance_sampling_ratio/min": 0.6210910677909851, "sampling/sampling_logp_difference/max": 0.47627758979797363, "sampling/sampling_logp_difference/mean": 0.012890784069895744, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 249.34375, "completions/mean_terminated_length": 249.34375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4012042284011841, "epoch": 1.2071078431372548, "frac_reward_zero_std": 0.0, "grad_norm": 2.2776803800898437, "kl": 0.12288743257522583, "learning_rate": 7.481469062821251e-07, "loss": -0.0047, "num_tokens": 31074350.0, "reward": 0.15625, "reward_std": 0.769389271736145, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.674807071685791, "sampling/importance_sampling_ratio/mean": 0.9999292492866516, "sampling/importance_sampling_ratio/min": 0.5288627743721008, "sampling/sampling_logp_difference/max": 0.637026309967041, "sampling/sampling_logp_difference/mean": 0.016513977199792862, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 221.84375, "completions/mean_terminated_length": 221.84375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3115932047367096, "epoch": 1.2083333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 1.6245114122560547, "kl": 0.11574351787567139, "learning_rate": 7.47528199258292e-07, "loss": 0.0316, "num_tokens": 31107060.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5633171796798706, "sampling/importance_sampling_ratio/mean": 1.000051736831665, "sampling/importance_sampling_ratio/min": 0.5911699533462524, "sampling/sampling_logp_difference/max": 0.525651752948761, "sampling/sampling_logp_difference/mean": 0.015016937628388405, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 169.859375, "completions/mean_terminated_length": 169.859375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.24668557941913605, "epoch": 1.2095588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.136974081092079, "kl": 0.12169291079044342, "learning_rate": 7.469089897678957e-07, "loss": -0.0077, "num_tokens": 31131163.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5935778617858887, "sampling/importance_sampling_ratio/mean": 0.9997788667678833, "sampling/importance_sampling_ratio/min": 0.48457133769989014, "sampling/sampling_logp_difference/max": 0.7244906425476074, "sampling/sampling_logp_difference/mean": 0.012976177036762238, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 179.390625, "completions/mean_terminated_length": 179.390625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.34161376953125, "epoch": 1.2107843137254901, "frac_reward_zero_std": 0.5, "grad_norm": 1.6586332707412077, "kl": 0.20643259584903717, "learning_rate": 7.462892790678925e-07, "loss": 0.0198, "num_tokens": 31160276.0, "reward": 0.65625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5905529260635376, "sampling/importance_sampling_ratio/mean": 1.000783085823059, "sampling/importance_sampling_ratio/min": 0.6451686024665833, "sampling/sampling_logp_difference/max": 0.4640817642211914, "sampling/sampling_logp_difference/mean": 0.015325578860938549, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 203.765625, "completions/mean_terminated_length": 203.765625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.32241612672805786, "epoch": 1.2120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.2146281829114274, "kl": 0.07475961744785309, "learning_rate": 7.456690684162556e-07, "loss": 0.0035, "num_tokens": 31187781.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.9699103832244873, "sampling/importance_sampling_ratio/mean": 0.9992858171463013, "sampling/importance_sampling_ratio/min": 0.25922614336013794, "sampling/sampling_logp_difference/max": 1.350054383277893, "sampling/sampling_logp_difference/mean": 0.016333363950252533, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 183.078125, "completions/mean_terminated_length": 183.078125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3116165101528168, "epoch": 1.213235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.2658097084896718, "kl": 0.09635313600301743, "learning_rate": 7.450483590719736e-07, "loss": -0.0309, "num_tokens": 31228314.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7604209184646606, "sampling/importance_sampling_ratio/mean": 1.0004222393035889, "sampling/importance_sampling_ratio/min": 0.4276825189590454, "sampling/sampling_logp_difference/max": 0.8493741750717163, "sampling/sampling_logp_difference/mean": 0.016576137393712997, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 165.109375, "completions/mean_terminated_length": 165.109375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3262573480606079, "epoch": 1.2144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.240263089907162, "kl": 0.09773312509059906, "learning_rate": 7.444271522950468e-07, "loss": -0.0025, "num_tokens": 31255633.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5864391326904297, "sampling/importance_sampling_ratio/mean": 1.0003514289855957, "sampling/importance_sampling_ratio/min": 0.5683965682983398, "sampling/sampling_logp_difference/max": 0.5649359226226807, "sampling/sampling_logp_difference/mean": 0.014754691161215305, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 189.234375, "completions/mean_terminated_length": 189.234375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.22865843772888184, "epoch": 1.215686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.036086823453182544, "kl": 0.08012405782938004, "learning_rate": 7.438054493464859e-07, "loss": 0.0008, "num_tokens": 31288560.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001484155654907, "sampling/importance_sampling_ratio/min": 0.4468447268009186, "sampling/sampling_logp_difference/max": 0.8055441379547119, "sampling/sampling_logp_difference/mean": 0.012340845540165901, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 180.359375, "completions/mean_terminated_length": 180.359375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2875712513923645, "epoch": 1.2169117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.05782685284992099, "kl": 0.10270027816295624, "learning_rate": 7.431832514883081e-07, "loss": 0.001, "num_tokens": 31316295.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6122339963912964, "sampling/importance_sampling_ratio/mean": 1.0004626512527466, "sampling/importance_sampling_ratio/min": 0.5098070502281189, "sampling/sampling_logp_difference/max": 0.6737229824066162, "sampling/sampling_logp_difference/mean": 0.014431476593017578, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 164.796875, "completions/mean_terminated_length": 164.796875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2780784070491791, "epoch": 1.218137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.299354881607356, "kl": 0.0973915159702301, "learning_rate": 7.42560559983536e-07, "loss": 0.0034, "num_tokens": 31345178.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4421542882919312, "sampling/importance_sampling_ratio/mean": 0.9997124075889587, "sampling/importance_sampling_ratio/min": 0.590282142162323, "sampling/sampling_logp_difference/max": 0.5271546244621277, "sampling/sampling_logp_difference/mean": 0.014276275411248207, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 207.40625, "completions/mean_terminated_length": 207.40625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.33022767305374146, "epoch": 1.219362745098039, "frac_reward_zero_std": 0.25, "grad_norm": 1.9557828840884965, "kl": 0.12490814924240112, "learning_rate": 7.419373760961939e-07, "loss": 0.0236, "num_tokens": 31378676.0, "reward": 0.84375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996752738952637, "sampling/importance_sampling_ratio/min": 0.5823451280593872, "sampling/sampling_logp_difference/max": 1.0023880004882812, "sampling/sampling_logp_difference/mean": 0.016804654151201248, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 179.28125, "completions/mean_terminated_length": 179.28125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.30481797456741333, "epoch": 1.2205882352941178, "frac_reward_zero_std": 0.25, "grad_norm": 2.502793416021743, "kl": 0.153112530708313, "learning_rate": 7.413137010913054e-07, "loss": -0.062, "num_tokens": 31405878.0, "reward": -0.03125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6448822021484375, "sampling/importance_sampling_ratio/mean": 0.9992611408233643, "sampling/importance_sampling_ratio/min": 0.4682300388813019, "sampling/sampling_logp_difference/max": 0.7587955594062805, "sampling/sampling_logp_difference/mean": 0.015971675515174866, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 218.828125, "completions/mean_terminated_length": 218.828125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.37005671858787537, "epoch": 1.221813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.6671530498676614, "kl": 0.14781561493873596, "learning_rate": 7.406895362348915e-07, "loss": -0.029, "num_tokens": 31442507.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7278977632522583, "sampling/importance_sampling_ratio/mean": 1.0003068447113037, "sampling/importance_sampling_ratio/min": 0.6104914546012878, "sampling/sampling_logp_difference/max": 0.546905517578125, "sampling/sampling_logp_difference/mean": 0.016775382682681084, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 197.109375, "completions/mean_terminated_length": 197.109375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3034955859184265, "epoch": 1.2230392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.0030801566549996, "kl": 0.10263898223638535, "learning_rate": 7.400648827939671e-07, "loss": 0.0125, "num_tokens": 31473474.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6770514249801636, "sampling/importance_sampling_ratio/mean": 0.9998710751533508, "sampling/importance_sampling_ratio/min": 0.6035663485527039, "sampling/sampling_logp_difference/max": 0.5170371532440186, "sampling/sampling_logp_difference/mean": 0.014926435425877571, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.363971471786499, "epoch": 1.224264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.2002474362045186, "kl": 0.12655779719352722, "learning_rate": 7.394397420365392e-07, "loss": 0.0484, "num_tokens": 31505114.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.524776577949524, "sampling/importance_sampling_ratio/mean": 0.9999206066131592, "sampling/importance_sampling_ratio/min": 0.47316774725914, "sampling/sampling_logp_difference/max": 0.7483053207397461, "sampling/sampling_logp_difference/mean": 0.014686089009046555, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 184.578125, "completions/mean_terminated_length": 184.578125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.26632946729660034, "epoch": 1.2254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.5162950548729182, "kl": 0.07559464871883392, "learning_rate": 7.388141152316038e-07, "loss": -0.0356, "num_tokens": 31533007.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6545816659927368, "sampling/importance_sampling_ratio/mean": 1.0002281665802002, "sampling/importance_sampling_ratio/min": 0.24965912103652954, "sampling/sampling_logp_difference/max": 1.3876588344573975, "sampling/sampling_logp_difference/mean": 0.01353788748383522, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 153.421875, "completions/mean_terminated_length": 153.421875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.24606212973594666, "epoch": 1.2267156862745099, "frac_reward_zero_std": 0.5, "grad_norm": 2.521539633292881, "kl": 0.11632443964481354, "learning_rate": 7.381880036491439e-07, "loss": 0.0603, "num_tokens": 31555610.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6533600091934204, "sampling/importance_sampling_ratio/mean": 0.9998239278793335, "sampling/importance_sampling_ratio/min": 0.5483725070953369, "sampling/sampling_logp_difference/max": 0.6008005142211914, "sampling/sampling_logp_difference/mean": 0.013158413581550121, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 231.484375, "completions/mean_terminated_length": 231.484375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.38222363591194153, "epoch": 1.2279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.523169943336086, "kl": 0.11676372587680817, "learning_rate": 7.375614085601264e-07, "loss": -0.0913, "num_tokens": 31591001.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999423623085022, "sampling/importance_sampling_ratio/min": 0.6058565974235535, "sampling/sampling_logp_difference/max": 1.1164112091064453, "sampling/sampling_logp_difference/mean": 0.01587112993001938, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 163.46875, "completions/mean_terminated_length": 163.46875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.30908840894699097, "epoch": 1.2291666666666667, "frac_reward_zero_std": 0.25, "grad_norm": 1.991581597290809, "kl": 0.21817518770694733, "learning_rate": 7.369343312364993e-07, "loss": 0.0015, "num_tokens": 31616423.0, "reward": 0.625, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9695045948028564, "sampling/importance_sampling_ratio/mean": 0.9995729923248291, "sampling/importance_sampling_ratio/min": 0.6912212371826172, "sampling/sampling_logp_difference/max": 0.6777820587158203, "sampling/sampling_logp_difference/mean": 0.015119457617402077, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 179.671875, "completions/mean_terminated_length": 179.671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3759731650352478, "epoch": 1.2303921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 2.0610345809582244, "kl": 0.17271912097930908, "learning_rate": 7.363067729511901e-07, "loss": 0.0265, "num_tokens": 31648354.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000411033630371, "sampling/importance_sampling_ratio/min": 0.5543676614761353, "sampling/sampling_logp_difference/max": 0.7612829208374023, "sampling/sampling_logp_difference/mean": 0.016802560538053513, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3747866153717041, "epoch": 1.2316176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 1.8112533158466604, "kl": 0.12410871684551239, "learning_rate": 7.356787349781022e-07, "loss": 0.0169, "num_tokens": 31681882.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5346338748931885, "sampling/importance_sampling_ratio/mean": 0.9999476671218872, "sampling/importance_sampling_ratio/min": 0.5783976912498474, "sampling/sampling_logp_difference/max": 0.5474936962127686, "sampling/sampling_logp_difference/mean": 0.017098799347877502, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 202.734375, "completions/mean_terminated_length": 202.734375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3643556833267212, "epoch": 1.232843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.7475408566935873, "kl": 0.15187852084636688, "learning_rate": 7.350502185921131e-07, "loss": -0.0309, "num_tokens": 31713049.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.8180066347122192, "sampling/importance_sampling_ratio/mean": 1.0002341270446777, "sampling/importance_sampling_ratio/min": 0.6139999032020569, "sampling/sampling_logp_difference/max": 0.597740650177002, "sampling/sampling_logp_difference/mean": 0.016775190830230713, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 186.890625, "completions/mean_terminated_length": 186.890625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.34378689527511597, "epoch": 1.2340686274509804, "frac_reward_zero_std": 0.25, "grad_norm": 2.246849376728109, "kl": 0.16027387976646423, "learning_rate": 7.344212250690711e-07, "loss": 0.0148, "num_tokens": 31739170.0, "reward": 0.28125, "reward_std": 0.6337460875511169, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 0.99985671043396, "sampling/importance_sampling_ratio/min": 0.4597095847129822, "sampling/sampling_logp_difference/max": 0.7771602869033813, "sampling/sampling_logp_difference/mean": 0.01568588614463806, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 207.3125, "completions/mean_terminated_length": 207.3125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27392303943634033, "epoch": 1.2352941176470589, "frac_reward_zero_std": 0.25, "grad_norm": 2.1483814390497153, "kl": 0.1423334777355194, "learning_rate": 7.337917556857934e-07, "loss": 0.0245, "num_tokens": 31773126.0, "reward": 0.53125, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6795238256454468, "sampling/importance_sampling_ratio/mean": 1.0003316402435303, "sampling/importance_sampling_ratio/min": 0.6329535245895386, "sampling/sampling_logp_difference/max": 0.5185103416442871, "sampling/sampling_logp_difference/mean": 0.01408257894217968, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 188.578125, "completions/mean_terminated_length": 188.578125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3587127923965454, "epoch": 1.2365196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.0526170599458136, "kl": 0.13530303537845612, "learning_rate": 7.331618117200625e-07, "loss": 0.0041, "num_tokens": 31805003.0, "reward": -0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6602283716201782, "sampling/importance_sampling_ratio/mean": 1.0001368522644043, "sampling/importance_sampling_ratio/min": 0.42615729570388794, "sampling/sampling_logp_difference/max": 0.8529467582702637, "sampling/sampling_logp_difference/mean": 0.015614226460456848, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 174.96875, "completions/mean_terminated_length": 174.96875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2448197603225708, "epoch": 1.2377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03888673529603373, "kl": 0.08840734511613846, "learning_rate": 7.325313944506253e-07, "loss": 0.0009, "num_tokens": 31836025.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004358291625977, "sampling/importance_sampling_ratio/min": 0.22012530267238617, "sampling/sampling_logp_difference/max": 1.5135583877563477, "sampling/sampling_logp_difference/mean": 0.013241377659142017, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 158.078125, "completions/mean_terminated_length": 158.078125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.20651528239250183, "epoch": 1.2389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.04904226566753591, "kl": 0.07498851418495178, "learning_rate": 7.319005051571885e-07, "loss": 0.0007, "num_tokens": 31861294.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996029138565063, "sampling/importance_sampling_ratio/min": 0.6086891293525696, "sampling/sampling_logp_difference/max": 0.8189167976379395, "sampling/sampling_logp_difference/mean": 0.012868411839008331, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 194.046875, "completions/mean_terminated_length": 194.046875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3027922809123993, "epoch": 1.2401960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.2038356318333772, "kl": 0.09024074673652649, "learning_rate": 7.312691451204177e-07, "loss": -0.0035, "num_tokens": 31894001.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5910779237747192, "sampling/importance_sampling_ratio/mean": 1.0005966424942017, "sampling/importance_sampling_ratio/min": 0.604347288608551, "sampling/sampling_logp_difference/max": 0.5036063194274902, "sampling/sampling_logp_difference/mean": 0.01586068421602249, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 158.234375, "completions/mean_terminated_length": 158.234375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.19949010014533997, "epoch": 1.241421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.06606497988914023, "kl": 0.08616028726100922, "learning_rate": 7.306373156219335e-07, "loss": 0.0009, "num_tokens": 31918672.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6798814535140991, "sampling/importance_sampling_ratio/mean": 1.0005948543548584, "sampling/importance_sampling_ratio/min": 0.6569399833679199, "sampling/sampling_logp_difference/max": 0.5187232494354248, "sampling/sampling_logp_difference/mean": 0.010515892878174782, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 223.390625, "completions/mean_terminated_length": 223.390625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.33404579758644104, "epoch": 1.2426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9159336896469366, "kl": 0.09262588620185852, "learning_rate": 7.300050179443099e-07, "loss": 0.0153, "num_tokens": 31952777.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997861981391907, "sampling/importance_sampling_ratio/min": 0.43403762578964233, "sampling/sampling_logp_difference/max": 0.8346240520477295, "sampling/sampling_logp_difference/mean": 0.015576720237731934, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.4179092347621918, "epoch": 1.2438725490196079, "frac_reward_zero_std": 0.25, "grad_norm": 2.142086344004228, "kl": 0.15286509692668915, "learning_rate": 7.293722533710714e-07, "loss": 0.0182, "num_tokens": 31999365.0, "reward": 0.09375, "reward_std": 0.676956295967102, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5148899555206299, "sampling/importance_sampling_ratio/mean": 0.9996585249900818, "sampling/importance_sampling_ratio/min": 0.3519838750362396, "sampling/sampling_logp_difference/max": 1.0441699028015137, "sampling/sampling_logp_difference/mean": 0.019419480115175247, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.34504783153533936, "epoch": 1.2450980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.875871222576556, "kl": 0.15173397958278656, "learning_rate": 7.287390231866893e-07, "loss": -0.043, "num_tokens": 32026509.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6614950895309448, "sampling/importance_sampling_ratio/mean": 1.0002130270004272, "sampling/importance_sampling_ratio/min": 0.4344994127750397, "sampling/sampling_logp_difference/max": 0.8335607051849365, "sampling/sampling_logp_difference/mean": 0.017975609749555588, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 207.171875, "completions/mean_terminated_length": 207.171875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27351927757263184, "epoch": 1.2463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.8365602884455642, "kl": 0.13922347128391266, "learning_rate": 7.281053286765815e-07, "loss": -0.0077, "num_tokens": 32057976.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.829240083694458, "sampling/importance_sampling_ratio/mean": 0.9998570680618286, "sampling/importance_sampling_ratio/min": 0.475933700799942, "sampling/sampling_logp_difference/max": 0.7424767017364502, "sampling/sampling_logp_difference/mean": 0.013677925802767277, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 235.953125, "completions/mean_terminated_length": 235.953125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.31339412927627563, "epoch": 1.2475490196078431, "frac_reward_zero_std": 0.25, "grad_norm": 1.751014458954254, "kl": 0.12213044613599777, "learning_rate": 7.274711711271073e-07, "loss": 0.0458, "num_tokens": 32090341.0, "reward": 0.4375, "reward_std": 0.6707825064659119, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6643818616867065, "sampling/importance_sampling_ratio/mean": 1.0001444816589355, "sampling/importance_sampling_ratio/min": 0.5219727158546448, "sampling/sampling_logp_difference/max": 0.6501400470733643, "sampling/sampling_logp_difference/mean": 0.015251624397933483, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 204.296875, "completions/mean_terminated_length": 204.296875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2566416263580322, "epoch": 1.2487745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.6707983897426926, "kl": 0.11317183077335358, "learning_rate": 7.268365518255665e-07, "loss": -0.0106, "num_tokens": 32119048.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.8755946159362793, "sampling/importance_sampling_ratio/mean": 1.000259280204773, "sampling/importance_sampling_ratio/min": 0.5523190498352051, "sampling/sampling_logp_difference/max": 0.6289258003234863, "sampling/sampling_logp_difference/mean": 0.014619041234254837, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 200.21875, "completions/mean_terminated_length": 200.21875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.347176194190979, "epoch": 1.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.06262725495643849, "kl": 0.15429773926734924, "learning_rate": 7.262014720601958e-07, "loss": 0.0014, "num_tokens": 32159574.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998721480369568, "sampling/importance_sampling_ratio/min": 0.5038028955459595, "sampling/sampling_logp_difference/max": 0.7020881175994873, "sampling/sampling_logp_difference/mean": 0.017093859612941742, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 242.828125, "completions/mean_terminated_length": 242.828125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2813074588775635, "epoch": 1.2512254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.4754270795558369, "kl": 0.07705482840538025, "learning_rate": 7.255659331201673e-07, "loss": -0.0177, "num_tokens": 32196523.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6573729515075684, "sampling/importance_sampling_ratio/mean": 0.9998900890350342, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 0.6855719089508057, "sampling/sampling_logp_difference/mean": 0.014529230073094368, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 190.859375, "completions/mean_terminated_length": 190.859375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3343523442745209, "epoch": 1.2524509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 1.7682001494721453, "kl": 0.12435860931873322, "learning_rate": 7.249299362955845e-07, "loss": -0.0104, "num_tokens": 32230322.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6121718883514404, "sampling/importance_sampling_ratio/mean": 1.0002822875976562, "sampling/importance_sampling_ratio/min": 0.6152330040931702, "sampling/sampling_logp_difference/max": 0.48575425148010254, "sampling/sampling_logp_difference/mean": 0.016807734966278076, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 163.984375, "completions/mean_terminated_length": 163.984375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3668976426124573, "epoch": 1.2536764705882353, "frac_reward_zero_std": 0.25, "grad_norm": 2.507965779502531, "kl": 0.23807550966739655, "learning_rate": 7.242934828774808e-07, "loss": -0.0069, "num_tokens": 32264401.0, "reward": -0.125, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4725977182388306, "sampling/importance_sampling_ratio/mean": 0.9996072053909302, "sampling/importance_sampling_ratio/min": 0.5234352350234985, "sampling/sampling_logp_difference/max": 0.6473419666290283, "sampling/sampling_logp_difference/mean": 0.019045095890760422, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 205.21875, "completions/mean_terminated_length": 205.21875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4307796359062195, "epoch": 1.2549019607843137, "frac_reward_zero_std": 0.25, "grad_norm": 1.9349461135817945, "kl": 0.1441027671098709, "learning_rate": 7.236565741578162e-07, "loss": 0.0065, "num_tokens": 32297695.0, "reward": 0.40625, "reward_std": 0.6331988573074341, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6223868131637573, "sampling/importance_sampling_ratio/mean": 0.9998844265937805, "sampling/importance_sampling_ratio/min": 0.576189398765564, "sampling/sampling_logp_difference/max": 0.551318883895874, "sampling/sampling_logp_difference/mean": 0.018998097628355026, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 228.09375, "completions/mean_terminated_length": 228.09375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3047144412994385, "epoch": 1.2561274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 1.8028179518326652, "kl": 0.07948464155197144, "learning_rate": 7.230192114294753e-07, "loss": -0.0933, "num_tokens": 32330021.0, "reward": 0.53125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4912073612213135, "sampling/importance_sampling_ratio/mean": 1.0001463890075684, "sampling/importance_sampling_ratio/min": 0.47132372856140137, "sampling/sampling_logp_difference/max": 0.7522101402282715, "sampling/sampling_logp_difference/mean": 0.014421386644244194, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3037158250808716, "epoch": 1.2573529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.5394101784886198, "kl": 0.0990850031375885, "learning_rate": 7.223813959862638e-07, "loss": 0.0078, "num_tokens": 32360069.0, "reward": 0.03125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000302791595459, "sampling/importance_sampling_ratio/min": 0.4870549738407135, "sampling/sampling_logp_difference/max": 0.7454228401184082, "sampling/sampling_logp_difference/mean": 0.01469477266073227, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 200.640625, "completions/mean_terminated_length": 200.640625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3100473880767822, "epoch": 1.258578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.9361945149371205, "kl": 0.121149942278862, "learning_rate": 7.217431291229067e-07, "loss": -0.0163, "num_tokens": 32392846.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007360219955444, "sampling/importance_sampling_ratio/mean": 0.9996852874755859, "sampling/importance_sampling_ratio/min": 0.4947281777858734, "sampling/sampling_logp_difference/max": 0.7037467956542969, "sampling/sampling_logp_difference/mean": 0.016091354191303253, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 179.90625, "completions/mean_terminated_length": 179.90625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2418641746044159, "epoch": 1.2598039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 5.861874824330707, "kl": 0.237470343708992, "learning_rate": 7.211044121350454e-07, "loss": 0.016, "num_tokens": 32420632.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.9645437002182007, "sampling/importance_sampling_ratio/mean": 0.9993852972984314, "sampling/importance_sampling_ratio/min": 0.08299688994884491, "sampling/sampling_logp_difference/max": 2.488952159881592, "sampling/sampling_logp_difference/mean": 0.015440763905644417, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 177.828125, "completions/mean_terminated_length": 177.828125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.29346632957458496, "epoch": 1.2610294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.3610814823819088, "kl": 0.13688868284225464, "learning_rate": 7.204652463192347e-07, "loss": -0.0255, "num_tokens": 32452589.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5071556568145752, "sampling/importance_sampling_ratio/mean": 1.000331163406372, "sampling/importance_sampling_ratio/min": 0.41115984320640564, "sampling/sampling_logp_difference/max": 0.8887733221054077, "sampling/sampling_logp_difference/mean": 0.01535726711153984, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 223.3125, "completions/mean_terminated_length": 223.3125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3583295941352844, "epoch": 1.2622549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.4573937518867763, "kl": 0.16331440210342407, "learning_rate": 7.198256329729411e-07, "loss": -0.0145, "num_tokens": 32489489.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6254981756210327, "sampling/importance_sampling_ratio/mean": 0.9997801780700684, "sampling/importance_sampling_ratio/min": 0.5264105200767517, "sampling/sampling_logp_difference/max": 0.6416739225387573, "sampling/sampling_logp_difference/mean": 0.016224460676312447, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3241284489631653, "epoch": 1.2634803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.8754633261538256, "kl": 0.18393567204475403, "learning_rate": 7.191855733945386e-07, "loss": -0.0036, "num_tokens": 32525889.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.737408995628357, "sampling/importance_sampling_ratio/mean": 0.9993525147438049, "sampling/importance_sampling_ratio/min": 0.4397071599960327, "sampling/sampling_logp_difference/max": 0.8216463327407837, "sampling/sampling_logp_difference/mean": 0.015761353075504303, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 198.90625, "completions/mean_terminated_length": 198.90625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3429109752178192, "epoch": 1.2647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.7782923039338083, "kl": 0.14510124921798706, "learning_rate": 7.185450688833083e-07, "loss": 0.0195, "num_tokens": 32555339.0, "reward": 0.3125, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000004529953003, "sampling/importance_sampling_ratio/min": 0.5084730982780457, "sampling/sampling_logp_difference/max": 0.8317980766296387, "sampling/sampling_logp_difference/mean": 0.01605771854519844, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 173.734375, "completions/mean_terminated_length": 173.734375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.38110750913619995, "epoch": 1.2659313725490196, "frac_reward_zero_std": 0.25, "grad_norm": 2.3586280279621117, "kl": 0.18957901000976562, "learning_rate": 7.179041207394331e-07, "loss": -0.0105, "num_tokens": 32583706.0, "reward": 0.5, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.532178521156311, "sampling/importance_sampling_ratio/mean": 1.0000102519989014, "sampling/importance_sampling_ratio/min": 0.595439076423645, "sampling/sampling_logp_difference/max": 0.518456220626831, "sampling/sampling_logp_difference/mean": 0.016505541279911995, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 191.34375, "completions/mean_terminated_length": 191.34375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.34220486879348755, "epoch": 1.267156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.05646084522535247, "kl": 0.12253264337778091, "learning_rate": 7.172627302639975e-07, "loss": 0.0012, "num_tokens": 32619664.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6237683296203613, "sampling/importance_sampling_ratio/mean": 0.9988462924957275, "sampling/importance_sampling_ratio/min": 0.37885966897010803, "sampling/sampling_logp_difference/max": 0.9705893993377686, "sampling/sampling_logp_difference/mean": 0.017415886744856834, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 292.4375, "completions/mean_terminated_length": 292.4375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.366678923368454, "epoch": 1.2683823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.6286243291552, "kl": 0.09592580795288086, "learning_rate": 7.166208987589836e-07, "loss": 0.0442, "num_tokens": 32653788.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995908141136169, "sampling/importance_sampling_ratio/min": 0.5976837277412415, "sampling/sampling_logp_difference/max": 0.7184348106384277, "sampling/sampling_logp_difference/mean": 0.016546351835131645, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2045753300189972, "epoch": 1.2696078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.0608310464490693, "kl": 0.11045706272125244, "learning_rate": 7.159786275272686e-07, "loss": -0.0054, "num_tokens": 32680064.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6443026065826416, "sampling/importance_sampling_ratio/mean": 1.0000745058059692, "sampling/importance_sampling_ratio/min": 0.6105195879936218, "sampling/sampling_logp_difference/max": 0.4973163604736328, "sampling/sampling_logp_difference/mean": 0.011742750182747841, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 178.84375, "completions/mean_terminated_length": 178.84375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3212544918060303, "epoch": 1.2708333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0584832092626283, "kl": 0.14732259511947632, "learning_rate": 7.153359178726221e-07, "loss": 0.0016, "num_tokens": 32706326.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.96951162815094, "sampling/importance_sampling_ratio/mean": 0.9999465346336365, "sampling/importance_sampling_ratio/min": 0.06851734220981598, "sampling/sampling_logp_difference/max": 2.680668354034424, "sampling/sampling_logp_difference/mean": 0.015915201976895332, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 222.84375, "completions/mean_terminated_length": 222.84375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4380054175853729, "epoch": 1.2720588235294117, "frac_reward_zero_std": 0.25, "grad_norm": 1.928208629225113, "kl": 0.18012815713882446, "learning_rate": 7.146927710997046e-07, "loss": -0.0014, "num_tokens": 32737740.0, "reward": 0.6875, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4524939060211182, "sampling/importance_sampling_ratio/mean": 0.9998493790626526, "sampling/importance_sampling_ratio/min": 0.6089973449707031, "sampling/sampling_logp_difference/max": 0.4959414005279541, "sampling/sampling_logp_difference/mean": 0.017109330743551254, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 233.828125, "completions/mean_terminated_length": 233.828125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3463178873062134, "epoch": 1.2732843137254901, "frac_reward_zero_std": 0.25, "grad_norm": 1.7030425356642425, "kl": 0.13462179899215698, "learning_rate": 7.140491885140628e-07, "loss": 0.0272, "num_tokens": 32768657.0, "reward": 0.0, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.917688250541687, "sampling/importance_sampling_ratio/mean": 1.000036597251892, "sampling/importance_sampling_ratio/min": 0.6179336309432983, "sampling/sampling_logp_difference/max": 0.6511204242706299, "sampling/sampling_logp_difference/mean": 0.015579458326101303, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 270.21875, "completions/mean_terminated_length": 270.21875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.34624820947647095, "epoch": 1.2745098039215685, "frac_reward_zero_std": 0.25, "grad_norm": 10.609165028958328, "kl": 0.14227654039859772, "learning_rate": 7.134051714221286e-07, "loss": 0.0249, "num_tokens": 32805935.0, "reward": 0.5, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7949460744857788, "sampling/importance_sampling_ratio/mean": 0.9997808933258057, "sampling/importance_sampling_ratio/min": 0.38582944869995117, "sampling/sampling_logp_difference/max": 0.9523599147796631, "sampling/sampling_logp_difference/mean": 0.01671173796057701, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 186.71875, "completions/mean_terminated_length": 186.71875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.22002194821834564, "epoch": 1.2757352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.12063033022994081, "kl": 0.0766913890838623, "learning_rate": 7.127607211312162e-07, "loss": 0.0008, "num_tokens": 32832333.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.498414158821106, "sampling/importance_sampling_ratio/mean": 0.9999425411224365, "sampling/importance_sampling_ratio/min": 0.3760538101196289, "sampling/sampling_logp_difference/max": 0.9780230522155762, "sampling/sampling_logp_difference/mean": 0.012133192270994186, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 264.484375, "completions/mean_terminated_length": 264.484375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2804335057735443, "epoch": 1.2769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.0508564541259473, "kl": 0.09116211533546448, "learning_rate": 7.121158389495185e-07, "loss": 0.0049, "num_tokens": 32865452.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003582239151, "sampling/importance_sampling_ratio/min": 0.34089013934135437, "sampling/sampling_logp_difference/max": 1.2095816135406494, "sampling/sampling_logp_difference/mean": 0.01316175889223814, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 285.703125, "completions/mean_terminated_length": 285.703125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3882972300052643, "epoch": 1.278186274509804, "frac_reward_zero_std": 0.25, "grad_norm": 1.6799637917123296, "kl": 0.12771184742450714, "learning_rate": 7.114705261861061e-07, "loss": 0.0512, "num_tokens": 32908553.0, "reward": 0.125, "reward_std": 0.644389271736145, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995465278625488, "sampling/importance_sampling_ratio/min": 0.4101741909980774, "sampling/sampling_logp_difference/max": 0.8911733627319336, "sampling/sampling_logp_difference/mean": 0.017560133710503578, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 243.359375, "completions/mean_terminated_length": 243.359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.26590538024902344, "epoch": 1.2794117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.4105436688831778, "kl": 0.14571696519851685, "learning_rate": 7.108247841509222e-07, "loss": 0.0174, "num_tokens": 32936736.0, "reward": 0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998252391815186, "sampling/importance_sampling_ratio/min": 0.4056454002857208, "sampling/sampling_logp_difference/max": 0.9022759199142456, "sampling/sampling_logp_difference/mean": 0.013069100677967072, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 186.046875, "completions/mean_terminated_length": 186.046875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.30763909220695496, "epoch": 1.280637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.081812013135514, "kl": 0.2121243178844452, "learning_rate": 7.101786141547828e-07, "loss": 0.0059, "num_tokens": 32964211.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6281697750091553, "sampling/importance_sampling_ratio/mean": 1.0008862018585205, "sampling/importance_sampling_ratio/min": 0.62428218126297, "sampling/sampling_logp_difference/max": 0.4874565601348877, "sampling/sampling_logp_difference/mean": 0.015111139044165611, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 233.28125, "completions/mean_terminated_length": 233.28125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.30027326941490173, "epoch": 1.281862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.806171699479573, "kl": 0.0992993712425232, "learning_rate": 7.095320175093718e-07, "loss": 0.0359, "num_tokens": 32994757.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5194125175476074, "sampling/importance_sampling_ratio/mean": 1.000177025794983, "sampling/importance_sampling_ratio/min": 0.6330479383468628, "sampling/sampling_logp_difference/max": 0.45720911026000977, "sampling/sampling_logp_difference/mean": 0.013746829703450203, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 188.65625, "completions/mean_terminated_length": 188.65625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21341879665851593, "epoch": 1.2830882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.1331586318601023, "kl": 0.14237648248672485, "learning_rate": 7.088849955272396e-07, "loss": 0.0338, "num_tokens": 33022623.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995142221450806, "sampling/importance_sampling_ratio/min": 0.6229760050773621, "sampling/sampling_logp_difference/max": 1.130134105682373, "sampling/sampling_logp_difference/mean": 0.011872018687427044, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 187.078125, "completions/mean_terminated_length": 187.078125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.361431747674942, "epoch": 1.284313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.6351310888941764, "kl": 0.1553775668144226, "learning_rate": 7.082375495217995e-07, "loss": 0.0002, "num_tokens": 33050388.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994962215423584, "sampling/importance_sampling_ratio/min": 0.06389383971691132, "sampling/sampling_logp_difference/max": 2.750532388687134, "sampling/sampling_logp_difference/mean": 0.01660950854420662, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 175.484375, "completions/mean_terminated_length": 175.484375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2529153823852539, "epoch": 1.2855392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.05582279030978907, "kl": 0.10453030467033386, "learning_rate": 7.075896808073263e-07, "loss": 0.001, "num_tokens": 33079779.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.861152172088623, "sampling/importance_sampling_ratio/mean": 1.001260757446289, "sampling/importance_sampling_ratio/min": 0.51512211561203, "sampling/sampling_logp_difference/max": 0.66335129737854, "sampling/sampling_logp_difference/mean": 0.013509858399629593, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2999563217163086, "epoch": 1.2867647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.4515053218908125, "kl": 0.12369360029697418, "learning_rate": 7.069413906989523e-07, "loss": 0.0016, "num_tokens": 33112915.0, "reward": 0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5768550634384155, "sampling/importance_sampling_ratio/mean": 0.9995861053466797, "sampling/importance_sampling_ratio/min": 0.6017664074897766, "sampling/sampling_logp_difference/max": 0.5078859329223633, "sampling/sampling_logp_difference/mean": 0.013802142813801765, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 222.390625, "completions/mean_terminated_length": 222.390625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3194698095321655, "epoch": 1.2879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.1232975024968728, "kl": 0.18103843927383423, "learning_rate": 7.062926805126652e-07, "loss": 0.0108, "num_tokens": 33145356.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006964206695557, "sampling/importance_sampling_ratio/min": 0.48957717418670654, "sampling/sampling_logp_difference/max": 0.9625020027160645, "sampling/sampling_logp_difference/mean": 0.016171928495168686, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 240.921875, "completions/mean_terminated_length": 240.921875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.30571770668029785, "epoch": 1.2892156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 1.0322747904837823, "kl": 0.09930259734392166, "learning_rate": 7.056435515653058e-07, "loss": 0.0178, "num_tokens": 33177431.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5560518503189087, "sampling/importance_sampling_ratio/mean": 0.9996767044067383, "sampling/importance_sampling_ratio/min": 0.502656102180481, "sampling/sampling_logp_difference/max": 0.6878490447998047, "sampling/sampling_logp_difference/mean": 0.01476350612938404, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.339089035987854, "epoch": 1.2904411764705883, "frac_reward_zero_std": 0.25, "grad_norm": 2.093188298417368, "kl": 0.16841793060302734, "learning_rate": 7.049940051745646e-07, "loss": 0.023, "num_tokens": 33206151.0, "reward": 0.5625, "reward_std": 0.6663130521774292, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7607289552688599, "sampling/importance_sampling_ratio/mean": 0.999548614025116, "sampling/importance_sampling_ratio/min": 0.5105680227279663, "sampling/sampling_logp_difference/max": 0.6722314357757568, "sampling/sampling_logp_difference/mean": 0.0169211458414793, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 230.078125, "completions/mean_terminated_length": 230.078125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.37655603885650635, "epoch": 1.2916666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 1.5857500327224365, "kl": 0.15081095695495605, "learning_rate": 7.043440426589795e-07, "loss": -0.0454, "num_tokens": 33243196.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8727679252624512, "sampling/importance_sampling_ratio/mean": 0.9996587634086609, "sampling/importance_sampling_ratio/min": 0.5912025570869446, "sampling/sampling_logp_difference/max": 0.6274175643920898, "sampling/sampling_logp_difference/mean": 0.01732090674340725, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 199.984375, "completions/mean_terminated_length": 199.984375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.22158865630626678, "epoch": 1.2928921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.05580966844163945, "kl": 0.12035124748945236, "learning_rate": 7.036936653379335e-07, "loss": 0.0011, "num_tokens": 33274587.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999060034751892, "sampling/importance_sampling_ratio/min": 0.5996983647346497, "sampling/sampling_logp_difference/max": 0.901757001876831, "sampling/sampling_logp_difference/mean": 0.013135458342730999, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 330.59375, "completions/mean_terminated_length": 330.59375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.39374738931655884, "epoch": 1.2941176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.7205187284383933, "kl": 0.11873811483383179, "learning_rate": 7.030428745316512e-07, "loss": -0.0245, "num_tokens": 33318177.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.62209153175354, "sampling/importance_sampling_ratio/mean": 0.9998387098312378, "sampling/importance_sampling_ratio/min": 0.43553099036216736, "sampling/sampling_logp_difference/max": 0.8311893939971924, "sampling/sampling_logp_difference/mean": 0.016345981508493423, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 202.0625, "completions/mean_terminated_length": 202.0625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.31683558225631714, "epoch": 1.295343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.5122700222221295, "kl": 0.13294917345046997, "learning_rate": 7.023916715611968e-07, "loss": 0.0392, "num_tokens": 33351877.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4794986248016357, "sampling/importance_sampling_ratio/mean": 0.9991467595100403, "sampling/importance_sampling_ratio/min": 0.6412928104400635, "sampling/sampling_logp_difference/max": 0.44426918029785156, "sampling/sampling_logp_difference/mean": 0.015316734090447426, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 224.21875, "completions/mean_terminated_length": 224.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.35916852951049805, "epoch": 1.2965686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.8416802163181198, "kl": 0.13629162311553955, "learning_rate": 7.017400577484712e-07, "loss": -0.0102, "num_tokens": 33382803.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8823031187057495, "sampling/importance_sampling_ratio/mean": 0.99980229139328, "sampling/importance_sampling_ratio/min": 0.637050986289978, "sampling/sampling_logp_difference/max": 0.6324961185455322, "sampling/sampling_logp_difference/mean": 0.01551731489598751, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 191.046875, "completions/mean_terminated_length": 191.046875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2954980731010437, "epoch": 1.2977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.052997823499318364, "kl": 0.13235683739185333, "learning_rate": 7.010880344162086e-07, "loss": 0.0013, "num_tokens": 33413926.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7516822814941406, "sampling/importance_sampling_ratio/mean": 0.9992790222167969, "sampling/importance_sampling_ratio/min": 0.5718526840209961, "sampling/sampling_logp_difference/max": 0.5605766773223877, "sampling/sampling_logp_difference/mean": 0.01648583449423313, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 275.078125, "completions/mean_terminated_length": 275.078125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2996733784675598, "epoch": 1.2990196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.298984373440655, "kl": 0.09220343828201294, "learning_rate": 7.004356028879758e-07, "loss": 0.0233, "num_tokens": 33451787.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6204760074615479, "sampling/importance_sampling_ratio/mean": 0.9997023344039917, "sampling/importance_sampling_ratio/min": 0.629865288734436, "sampling/sampling_logp_difference/max": 0.48271989822387695, "sampling/sampling_logp_difference/mean": 0.01601042039692402, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.24982041120529175, "epoch": 1.3002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.24166632185575956, "kl": 0.1469212919473648, "learning_rate": 6.99782764488167e-07, "loss": 0.0016, "num_tokens": 33483859.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8205466270446777, "sampling/importance_sampling_ratio/mean": 0.9996812343597412, "sampling/importance_sampling_ratio/min": 0.6778234839439392, "sampling/sampling_logp_difference/max": 0.5991368293762207, "sampling/sampling_logp_difference/mean": 0.011886507272720337, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 246.671875, "completions/mean_terminated_length": 246.671875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2619977593421936, "epoch": 1.3014705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.9328385514279917, "kl": 0.10800004750490189, "learning_rate": 6.991295205420027e-07, "loss": 0.0032, "num_tokens": 33517422.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.574527382850647, "sampling/importance_sampling_ratio/mean": 1.000114917755127, "sampling/importance_sampling_ratio/min": 0.5232529640197754, "sampling/sampling_logp_difference/max": 0.6476902961730957, "sampling/sampling_logp_difference/mean": 0.013545207679271698, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 257.109375, "completions/mean_terminated_length": 257.109375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3410852551460266, "epoch": 1.3026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.047787409747118745, "kl": 0.10063477605581284, "learning_rate": 6.984758723755272e-07, "loss": 0.001, "num_tokens": 33553045.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6462818384170532, "sampling/importance_sampling_ratio/mean": 1.0003209114074707, "sampling/importance_sampling_ratio/min": 0.415952205657959, "sampling/sampling_logp_difference/max": 0.8771849274635315, "sampling/sampling_logp_difference/mean": 0.015459725633263588, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 208.21875, "completions/mean_terminated_length": 208.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.41836845874786377, "epoch": 1.303921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.852438571334581, "kl": 0.15046635270118713, "learning_rate": 6.978218213156044e-07, "loss": -0.0373, "num_tokens": 33581507.0, "reward": 0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6007379293441772, "sampling/importance_sampling_ratio/mean": 1.000087022781372, "sampling/importance_sampling_ratio/min": 0.6114194989204407, "sampling/sampling_logp_difference/max": 0.4919719696044922, "sampling/sampling_logp_difference/mean": 0.01780632510781288, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 216.1875, "completions/mean_terminated_length": 216.1875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.27934664487838745, "epoch": 1.3051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.3666395790917611, "kl": 0.125428706407547, "learning_rate": 6.971673686899169e-07, "loss": 0.0093, "num_tokens": 33614175.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5971810817718506, "sampling/importance_sampling_ratio/mean": 0.9997473359107971, "sampling/importance_sampling_ratio/min": 0.5324405431747437, "sampling/sampling_logp_difference/max": 0.6302840709686279, "sampling/sampling_logp_difference/mean": 0.016285786405205727, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3244784474372864, "epoch": 1.3063725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.1379653564633323, "kl": 0.2047014981508255, "learning_rate": 6.965125158269618e-07, "loss": -0.0433, "num_tokens": 33644195.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6598302125930786, "sampling/importance_sampling_ratio/mean": 0.9999638199806213, "sampling/importance_sampling_ratio/min": 0.5362949371337891, "sampling/sampling_logp_difference/max": 0.6230709552764893, "sampling/sampling_logp_difference/mean": 0.01563986763358116, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 191.9375, "completions/mean_terminated_length": 191.9375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.35488128662109375, "epoch": 1.3075980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.5621760628019599, "kl": 0.14954595267772675, "learning_rate": 6.958572640560491e-07, "loss": 0.0138, "num_tokens": 33679647.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5813579559326172, "sampling/importance_sampling_ratio/mean": 1.0000293254852295, "sampling/importance_sampling_ratio/min": 0.6106756329536438, "sampling/sampling_logp_difference/max": 0.49318933486938477, "sampling/sampling_logp_difference/mean": 0.01592111587524414, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 179.40625, "completions/mean_terminated_length": 179.40625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.20306000113487244, "epoch": 1.3088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.3036170426143063, "kl": 0.09833575785160065, "learning_rate": 6.952016147072981e-07, "loss": 0.0, "num_tokens": 33706425.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9271328449249268, "sampling/importance_sampling_ratio/mean": 0.9996373653411865, "sampling/importance_sampling_ratio/min": 0.3300285041332245, "sampling/sampling_logp_difference/max": 1.1085762977600098, "sampling/sampling_logp_difference/mean": 0.01394584123045206, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 243.359375, "completions/mean_terminated_length": 243.359375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.405498743057251, "epoch": 1.3100490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.2919334017094637, "kl": 0.1486983299255371, "learning_rate": 6.945455691116358e-07, "loss": 0.01, "num_tokens": 33740384.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6165982484817505, "sampling/importance_sampling_ratio/mean": 1.000389814376831, "sampling/importance_sampling_ratio/min": 0.6166492104530334, "sampling/sampling_logp_difference/max": 0.48345494270324707, "sampling/sampling_logp_difference/mean": 0.017727406695485115, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 179.296875, "completions/mean_terminated_length": 179.296875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2689400613307953, "epoch": 1.3112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.09351448609747967, "kl": 0.11540800333023071, "learning_rate": 6.938891286007928e-07, "loss": 0.0012, "num_tokens": 33775987.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5074286460876465, "sampling/importance_sampling_ratio/mean": 0.9998900890350342, "sampling/importance_sampling_ratio/min": 0.2538463771343231, "sampling/sampling_logp_difference/max": 1.3710259199142456, "sampling/sampling_logp_difference/mean": 0.014276232570409775, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 175.921875, "completions/mean_terminated_length": 175.921875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2785714268684387, "epoch": 1.3125, "frac_reward_zero_std": 0.5, "grad_norm": 2.0695506021075314, "kl": 0.09059837460517883, "learning_rate": 6.932322945073023e-07, "loss": -0.0602, "num_tokens": 33801374.0, "reward": -0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6554646492004395, "sampling/importance_sampling_ratio/mean": 1.0000858306884766, "sampling/importance_sampling_ratio/min": 0.6298375129699707, "sampling/sampling_logp_difference/max": 0.5040817260742188, "sampling/sampling_logp_difference/mean": 0.015221224166452885, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 255.65625, "completions/mean_terminated_length": 255.65625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.32079508900642395, "epoch": 1.3137254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.6987967605391736, "kl": 0.12066194415092468, "learning_rate": 6.925750681644953e-07, "loss": -0.1204, "num_tokens": 33833352.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995695948600769, "sampling/importance_sampling_ratio/min": 0.49790158867836, "sampling/sampling_logp_difference/max": 0.7732429504394531, "sampling/sampling_logp_difference/mean": 0.014812313951551914, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1903567910194397, "epoch": 1.3149509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.04255177341041909, "kl": 0.08062325417995453, "learning_rate": 6.919174509065003e-07, "loss": 0.0007, "num_tokens": 33874184.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999676942825317, "sampling/importance_sampling_ratio/min": 0.5171644687652588, "sampling/sampling_logp_difference/max": 0.8792343139648438, "sampling/sampling_logp_difference/mean": 0.012009510770440102, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 257.78125, "completions/mean_terminated_length": 257.78125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3101387023925781, "epoch": 1.3161764705882353, "frac_reward_zero_std": 0.25, "grad_norm": 1.7028719699463302, "kl": 0.1298815906047821, "learning_rate": 6.91259444068238e-07, "loss": -0.0294, "num_tokens": 33908906.0, "reward": 0.8125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.9180221557617188, "sampling/importance_sampling_ratio/mean": 0.9998555779457092, "sampling/importance_sampling_ratio/min": 0.49833470582962036, "sampling/sampling_logp_difference/max": 0.6964833736419678, "sampling/sampling_logp_difference/mean": 0.0152537040412426, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 309.734375, "completions/mean_terminated_length": 309.734375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.42819422483444214, "epoch": 1.3174019607843137, "frac_reward_zero_std": 0.0, "grad_norm": 2.1147177454245716, "kl": 0.14049476385116577, "learning_rate": 6.906010489854209e-07, "loss": 0.0592, "num_tokens": 33950809.0, "reward": 0.21875, "reward_std": 0.8013203144073486, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6463440656661987, "sampling/importance_sampling_ratio/mean": 0.9998204708099365, "sampling/importance_sampling_ratio/min": 0.4735189378261566, "sampling/sampling_logp_difference/max": 0.747563362121582, "sampling/sampling_logp_difference/mean": 0.019438711926341057, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3980330526828766, "epoch": 1.3186274509803921, "frac_reward_zero_std": 0.25, "grad_norm": 1.8301895289917482, "kl": 0.14308233559131622, "learning_rate": 6.899422669945493e-07, "loss": -0.0192, "num_tokens": 33985737.0, "reward": 0.03125, "reward_std": 0.6970869898796082, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6340501308441162, "sampling/importance_sampling_ratio/mean": 1.0005192756652832, "sampling/importance_sampling_ratio/min": 0.4092217683792114, "sampling/sampling_logp_difference/max": 0.8934980630874634, "sampling/sampling_logp_difference/mean": 0.01768249273300171, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 177.734375, "completions/mean_terminated_length": 177.734375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21226820349693298, "epoch": 1.3198529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 2.614710418153234, "kl": 0.10652191936969757, "learning_rate": 6.892830994329088e-07, "loss": -0.0369, "num_tokens": 34017496.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002856254577637, "sampling/importance_sampling_ratio/min": 0.5585871934890747, "sampling/sampling_logp_difference/max": 0.9647364616394043, "sampling/sampling_logp_difference/mean": 0.012589693069458008, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 285.765625, "completions/mean_terminated_length": 285.765625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4327370524406433, "epoch": 1.321078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.3803107836661115, "kl": 0.15147444605827332, "learning_rate": 6.886235476385681e-07, "loss": -0.0133, "num_tokens": 34053721.0, "reward": 0.15625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5612167119979858, "sampling/importance_sampling_ratio/mean": 1.0000461339950562, "sampling/importance_sampling_ratio/min": 0.4624379277229309, "sampling/sampling_logp_difference/max": 0.7712429165840149, "sampling/sampling_logp_difference/mean": 0.018006278201937675, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 182.109375, "completions/mean_terminated_length": 182.109375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2860751152038574, "epoch": 1.3223039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.07197268825494085, "kl": 0.15290674567222595, "learning_rate": 6.879636129503751e-07, "loss": 0.0014, "num_tokens": 34085664.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6207294464111328, "sampling/importance_sampling_ratio/mean": 1.000579595565796, "sampling/importance_sampling_ratio/min": 0.577124297618866, "sampling/sampling_logp_difference/max": 0.5496976375579834, "sampling/sampling_logp_difference/mean": 0.015663743019104004, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 221.53125, "completions/mean_terminated_length": 221.53125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.28195878863334656, "epoch": 1.3235294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 2.605452125919545, "kl": 0.08582913130521774, "learning_rate": 6.87303296707956e-07, "loss": 0.0488, "num_tokens": 34121666.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995183944702148, "sampling/importance_sampling_ratio/min": 0.11842865496873856, "sampling/sampling_logp_difference/max": 2.1334445476531982, "sampling/sampling_logp_difference/mean": 0.015778496861457825, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 222.203125, "completions/mean_terminated_length": 222.203125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.32386523485183716, "epoch": 1.3247549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 1.935591694995017, "kl": 0.10367701202630997, "learning_rate": 6.866426002517105e-07, "loss": -0.0449, "num_tokens": 34149423.0, "reward": 0.21875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6926884651184082, "sampling/importance_sampling_ratio/mean": 1.000137209892273, "sampling/importance_sampling_ratio/min": 0.5572013854980469, "sampling/sampling_logp_difference/max": 0.5848284959793091, "sampling/sampling_logp_difference/mean": 0.014832671731710434, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 174.21875, "completions/mean_terminated_length": 174.21875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2482524961233139, "epoch": 1.3259803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.9509560925915717, "kl": 0.10061395913362503, "learning_rate": 6.859815249228105e-07, "loss": 0.0079, "num_tokens": 34177229.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.576332926750183, "sampling/importance_sampling_ratio/mean": 0.9995920658111572, "sampling/importance_sampling_ratio/min": 0.5588234663009644, "sampling/sampling_logp_difference/max": 0.5819215774536133, "sampling/sampling_logp_difference/mean": 0.01229644101113081, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 187.046875, "completions/mean_terminated_length": 187.046875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.31346315145492554, "epoch": 1.3272058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.0175494789187907, "kl": 0.11647707223892212, "learning_rate": 6.853200720631972e-07, "loss": 0.0108, "num_tokens": 34204768.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6359094381332397, "sampling/importance_sampling_ratio/mean": 1.0006194114685059, "sampling/importance_sampling_ratio/min": 0.5764164924621582, "sampling/sampling_logp_difference/max": 0.5509247779846191, "sampling/sampling_logp_difference/mean": 0.016130639240145683, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 225.890625, "completions/mean_terminated_length": 225.890625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.33300894498825073, "epoch": 1.3284313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.7513658062777013, "kl": 0.11280956864356995, "learning_rate": 6.846582430155781e-07, "loss": -0.0062, "num_tokens": 34234137.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5556352138519287, "sampling/importance_sampling_ratio/mean": 0.9996647834777832, "sampling/importance_sampling_ratio/min": 0.5057145953178406, "sampling/sampling_logp_difference/max": 0.6817827224731445, "sampling/sampling_logp_difference/mean": 0.016297079622745514, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 172.796875, "completions/mean_terminated_length": 172.796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2712753713130951, "epoch": 1.329656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.7740138265401932, "kl": 0.12398400157690048, "learning_rate": 6.839960391234242e-07, "loss": 0.0476, "num_tokens": 34258172.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003924369812012, "sampling/importance_sampling_ratio/min": 0.432391881942749, "sampling/sampling_logp_difference/max": 0.8384230136871338, "sampling/sampling_logp_difference/mean": 0.01533106155693531, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 223.65625, "completions/mean_terminated_length": 223.65625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.28661102056503296, "epoch": 1.3308823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 1.1567830622447894, "kl": 0.10410013794898987, "learning_rate": 6.833334617309672e-07, "loss": 0.045, "num_tokens": 34292022.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5311121940612793, "sampling/importance_sampling_ratio/mean": 1.0001766681671143, "sampling/importance_sampling_ratio/min": 0.5280182957649231, "sampling/sampling_logp_difference/max": 0.6386243104934692, "sampling/sampling_logp_difference/mean": 0.014018706977367401, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 241.109375, "completions/mean_terminated_length": 241.109375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2740551829338074, "epoch": 1.3321078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0361179328305387, "kl": 0.09827157109975815, "learning_rate": 6.826705121831976e-07, "loss": 0.0009, "num_tokens": 34323677.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5972678661346436, "sampling/importance_sampling_ratio/mean": 0.999886155128479, "sampling/importance_sampling_ratio/min": 0.643671989440918, "sampling/sampling_logp_difference/max": 0.468294620513916, "sampling/sampling_logp_difference/mean": 0.013634300790727139, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 292.15625, "completions/mean_terminated_length": 292.15625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.33552277088165283, "epoch": 1.3333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 1.4465584966332885, "kl": 0.09587956964969635, "learning_rate": 6.820071918258605e-07, "loss": 0.0061, "num_tokens": 34361143.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.921859860420227, "sampling/importance_sampling_ratio/mean": 1.0002851486206055, "sampling/importance_sampling_ratio/min": 0.06600768119096756, "sampling/sampling_logp_difference/max": 2.717984199523926, "sampling/sampling_logp_difference/mean": 0.0153445303440094, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 178.234375, "completions/mean_terminated_length": 178.234375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24344758689403534, "epoch": 1.3345588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.043404515045103384, "kl": 0.09411592781543732, "learning_rate": 6.813435020054548e-07, "loss": 0.0009, "num_tokens": 34387958.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000336170196533, "sampling/importance_sampling_ratio/min": 0.610383152961731, "sampling/sampling_logp_difference/max": 0.8105227947235107, "sampling/sampling_logp_difference/mean": 0.013158331625163555, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 263.703125, "completions/mean_terminated_length": 263.703125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.31929922103881836, "epoch": 1.3357843137254901, "frac_reward_zero_std": 0.5, "grad_norm": 1.258805920791015, "kl": 0.09902656078338623, "learning_rate": 6.806794440692282e-07, "loss": -0.0302, "num_tokens": 34422083.0, "reward": -0.0625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.636869192123413, "sampling/importance_sampling_ratio/mean": 1.0007058382034302, "sampling/importance_sampling_ratio/min": 0.37227267026901245, "sampling/sampling_logp_difference/max": 0.9881287813186646, "sampling/sampling_logp_difference/mean": 0.015133077278733253, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 223.515625, "completions/mean_terminated_length": 223.515625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2561953067779541, "epoch": 1.3370098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.3665153423660428, "kl": 0.07908254861831665, "learning_rate": 6.800150193651767e-07, "loss": 0.0024, "num_tokens": 34453124.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.477170467376709, "sampling/importance_sampling_ratio/mean": 0.999785304069519, "sampling/importance_sampling_ratio/min": 0.6561281681060791, "sampling/sampling_logp_difference/max": 0.4213991165161133, "sampling/sampling_logp_difference/mean": 0.013078063726425171, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 199.59375, "completions/mean_terminated_length": 199.59375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2496260553598404, "epoch": 1.3382352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 1.374085590077728, "kl": 0.08954588323831558, "learning_rate": 6.793502292420401e-07, "loss": -0.008, "num_tokens": 34481610.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6981291770935059, "sampling/importance_sampling_ratio/mean": 1.0000662803649902, "sampling/importance_sampling_ratio/min": 0.6202826499938965, "sampling/sampling_logp_difference/max": 0.5295271873474121, "sampling/sampling_logp_difference/mean": 0.013239433988928795, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 250.21875, "completions/mean_terminated_length": 250.21875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.36997076869010925, "epoch": 1.3394607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.07126523930538059, "kl": 0.1038190945982933, "learning_rate": 6.786850750493005e-07, "loss": 0.001, "num_tokens": 34515576.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7066212892532349, "sampling/importance_sampling_ratio/mean": 1.0008347034454346, "sampling/importance_sampling_ratio/min": 0.6208910346031189, "sampling/sampling_logp_difference/max": 0.5345156192779541, "sampling/sampling_logp_difference/mean": 0.01807655580341816, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 160.84375, "completions/mean_terminated_length": 160.84375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.22015033662319183, "epoch": 1.340686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.046881469980766685, "kl": 0.07629378885030746, "learning_rate": 6.780195581371784e-07, "loss": 0.0008, "num_tokens": 34539902.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6254730224609375, "sampling/importance_sampling_ratio/mean": 1.0000666379928589, "sampling/importance_sampling_ratio/min": 0.6097134351730347, "sampling/sampling_logp_difference/max": 0.4947662353515625, "sampling/sampling_logp_difference/mean": 0.013527627103030682, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.260101318359375, "epoch": 1.3419117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.05144603117378995, "kl": 0.08464940637350082, "learning_rate": 6.773536798566313e-07, "loss": 0.0008, "num_tokens": 34571550.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002096891403198, "sampling/importance_sampling_ratio/min": 0.5105252861976624, "sampling/sampling_logp_difference/max": 0.8242547512054443, "sampling/sampling_logp_difference/mean": 0.013929269276559353, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 264.0, "completions/mean_terminated_length": 264.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.19407080113887787, "epoch": 1.343137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.7789398781421727, "kl": 0.060388315469026566, "learning_rate": 6.766874415593495e-07, "loss": -0.0081, "num_tokens": 34605774.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996485710144043, "sampling/importance_sampling_ratio/min": 0.44851693511009216, "sampling/sampling_logp_difference/max": 0.8018088340759277, "sampling/sampling_logp_difference/mean": 0.011173035018146038, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2143286168575287, "epoch": 1.344362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.05273733580224862, "kl": 0.08921480178833008, "learning_rate": 6.760208445977549e-07, "loss": 0.0008, "num_tokens": 34634022.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6457408666610718, "sampling/importance_sampling_ratio/mean": 1.000124216079712, "sampling/importance_sampling_ratio/min": 0.6015896797180176, "sampling/sampling_logp_difference/max": 0.5081796646118164, "sampling/sampling_logp_difference/mean": 0.01239019725471735, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 273.8125, "completions/mean_terminated_length": 273.8125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.30872780084609985, "epoch": 1.3455882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.06237159816268388, "kl": 0.08186507225036621, "learning_rate": 6.753538903249974e-07, "loss": 0.0007, "num_tokens": 34677674.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003366470336914, "sampling/importance_sampling_ratio/min": 0.4494738280773163, "sampling/sampling_logp_difference/max": 0.8946397304534912, "sampling/sampling_logp_difference/mean": 0.01739206723868847, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 254.3125, "completions/mean_terminated_length": 254.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.31094664335250854, "epoch": 1.346813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.2818491418402742, "kl": 0.10428554564714432, "learning_rate": 6.74686580094951e-07, "loss": 0.01, "num_tokens": 34711902.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001916885375977, "sampling/importance_sampling_ratio/min": 0.19906094670295715, "sampling/sampling_logp_difference/max": 1.614144206047058, "sampling/sampling_logp_difference/mean": 0.016226403415203094, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 229.203125, "completions/mean_terminated_length": 229.203125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.38202738761901855, "epoch": 1.3480392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.6089337264588108, "kl": 0.16241875290870667, "learning_rate": 6.740189152622142e-07, "loss": -0.0471, "num_tokens": 34746091.0, "reward": -0.125, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995103478431702, "sampling/importance_sampling_ratio/min": 0.040602829307317734, "sampling/sampling_logp_difference/max": 3.2039175033569336, "sampling/sampling_logp_difference/mean": 0.01960323192179203, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1822962462902069, "epoch": 1.3492647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.08516365588250555, "kl": 0.07012251019477844, "learning_rate": 6.733508971821036e-07, "loss": 0.0007, "num_tokens": 34770747.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991965293884277, "sampling/importance_sampling_ratio/min": 0.45569705963134766, "sampling/sampling_logp_difference/max": 0.7859270572662354, "sampling/sampling_logp_difference/mean": 0.013009462505578995, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 271.703125, "completions/mean_terminated_length": 271.703125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3267415761947632, "epoch": 1.3504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.2458060734568923, "kl": 0.08820392191410065, "learning_rate": 6.726825272106538e-07, "loss": 0.0515, "num_tokens": 34806344.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999918341636658, "sampling/importance_sampling_ratio/min": 0.4708138108253479, "sampling/sampling_logp_difference/max": 0.7646852731704712, "sampling/sampling_logp_difference/mean": 0.01696830615401268, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 225.8125, "completions/mean_terminated_length": 225.8125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4195988178253174, "epoch": 1.3517156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 1.2420206724302587, "kl": 0.2157381772994995, "learning_rate": 6.720138067046134e-07, "loss": 0.023, "num_tokens": 34837500.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.8745075464248657, "sampling/importance_sampling_ratio/mean": 0.9999470114707947, "sampling/importance_sampling_ratio/min": 0.5703800916671753, "sampling/sampling_logp_difference/max": 0.6283459663391113, "sampling/sampling_logp_difference/mean": 0.018711727112531662, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 172.265625, "completions/mean_terminated_length": 172.265625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.20912107825279236, "epoch": 1.3529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.05031629823568618, "kl": 0.07606321573257446, "learning_rate": 6.713447370214431e-07, "loss": 0.0008, "num_tokens": 34863357.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5024638175964355, "sampling/importance_sampling_ratio/mean": 0.9996066689491272, "sampling/importance_sampling_ratio/min": 0.6483970284461975, "sampling/sampling_logp_difference/max": 0.43325209617614746, "sampling/sampling_logp_difference/mean": 0.012016498483717442, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 171.796875, "completions/mean_terminated_length": 171.796875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26856446266174316, "epoch": 1.3541666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 1.4562655301043657, "kl": 0.09392675757408142, "learning_rate": 6.706753195193116e-07, "loss": 0.001, "num_tokens": 34890096.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8024160861968994, "sampling/importance_sampling_ratio/mean": 1.0000146627426147, "sampling/importance_sampling_ratio/min": 0.6119200587272644, "sampling/sampling_logp_difference/max": 0.5891280174255371, "sampling/sampling_logp_difference/mean": 0.015046817250549793, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 190.640625, "completions/mean_terminated_length": 190.640625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2494535744190216, "epoch": 1.3553921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.3153675097880058, "kl": 0.12353288382291794, "learning_rate": 6.700055555570941e-07, "loss": -0.0027, "num_tokens": 34919465.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6230236291885376, "sampling/importance_sampling_ratio/mean": 1.000239372253418, "sampling/importance_sampling_ratio/min": 0.513964056968689, "sampling/sampling_logp_difference/max": 0.665601909160614, "sampling/sampling_logp_difference/mean": 0.014158805832266808, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 182.578125, "completions/mean_terminated_length": 182.578125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.20572876930236816, "epoch": 1.3566176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 1.245647422311269, "kl": 0.07359683513641357, "learning_rate": 6.693354464943688e-07, "loss": 0.075, "num_tokens": 34946398.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5465470552444458, "sampling/importance_sampling_ratio/mean": 0.9993281960487366, "sampling/importance_sampling_ratio/min": 0.4414757788181305, "sampling/sampling_logp_difference/max": 0.8176320791244507, "sampling/sampling_logp_difference/mean": 0.012859884649515152, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 171.046875, "completions/mean_terminated_length": 171.046875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2153121381998062, "epoch": 1.357843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.044084523082514315, "kl": 0.08404094725847244, "learning_rate": 6.68664993691415e-07, "loss": 0.0008, "num_tokens": 34979681.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4022732973098755, "sampling/importance_sampling_ratio/mean": 1.000349760055542, "sampling/importance_sampling_ratio/min": 0.6051716804504395, "sampling/sampling_logp_difference/max": 0.5022430419921875, "sampling/sampling_logp_difference/mean": 0.012641921639442444, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 177.046875, "completions/mean_terminated_length": 177.046875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2505217492580414, "epoch": 1.3590686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.14886236711394396, "kl": 0.11417225748300552, "learning_rate": 6.679941985092092e-07, "loss": 0.001, "num_tokens": 35011028.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6209088563919067, "sampling/importance_sampling_ratio/mean": 0.9995545744895935, "sampling/importance_sampling_ratio/min": 0.2842315137386322, "sampling/sampling_logp_difference/max": 1.257966160774231, "sampling/sampling_logp_difference/mean": 0.01553020253777504, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 227.546875, "completions/mean_terminated_length": 227.546875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.32394784688949585, "epoch": 1.3602941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.899899238781701, "kl": 0.13107380270957947, "learning_rate": 6.673230623094231e-07, "loss": 0.0165, "num_tokens": 35044311.0, "reward": 0.28125, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993151426315308, "sampling/importance_sampling_ratio/min": 0.5468358397483826, "sampling/sampling_logp_difference/max": 0.8583822250366211, "sampling/sampling_logp_difference/mean": 0.016252432018518448, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 261.1875, "completions/mean_terminated_length": 261.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.37364470958709717, "epoch": 1.3615196078431373, "frac_reward_zero_std": 0.25, "grad_norm": 1.72268187710298, "kl": 0.1305997222661972, "learning_rate": 6.666515864544208e-07, "loss": -0.0002, "num_tokens": 35078419.0, "reward": 0.25, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.7393680810928345, "sampling/importance_sampling_ratio/mean": 1.0007290840148926, "sampling/importance_sampling_ratio/min": 0.5912945866584778, "sampling/sampling_logp_difference/max": 0.5535218715667725, "sampling/sampling_logp_difference/mean": 0.01725170388817787, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 172.234375, "completions/mean_terminated_length": 172.234375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.29257479310035706, "epoch": 1.3627450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 2.0340952596276525, "kl": 0.1277839094400406, "learning_rate": 6.659797723072558e-07, "loss": 0.0129, "num_tokens": 35109554.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8624109029769897, "sampling/importance_sampling_ratio/mean": 0.9998859167098999, "sampling/importance_sampling_ratio/min": 0.4957435131072998, "sampling/sampling_logp_difference/max": 0.7016966342926025, "sampling/sampling_logp_difference/mean": 0.01669422537088394, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2708520293235779, "epoch": 1.3639705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.063436147267886, "kl": 0.0937291830778122, "learning_rate": 6.653076212316681e-07, "loss": 0.0101, "num_tokens": 35144738.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8026752471923828, "sampling/importance_sampling_ratio/mean": 0.999823808670044, "sampling/importance_sampling_ratio/min": 0.25222671031951904, "sampling/sampling_logp_difference/max": 1.3774269819259644, "sampling/sampling_logp_difference/mean": 0.014111516997218132, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 195.015625, "completions/mean_terminated_length": 195.015625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.22389185428619385, "epoch": 1.3651960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.036642500568826744, "kl": 0.07788003236055374, "learning_rate": 6.646351345920818e-07, "loss": 0.0008, "num_tokens": 35175907.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7600457668304443, "sampling/importance_sampling_ratio/mean": 0.999549388885498, "sampling/importance_sampling_ratio/min": 0.6267169117927551, "sampling/sampling_logp_difference/max": 0.5653398036956787, "sampling/sampling_logp_difference/mean": 0.012162875384092331, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 338.09375, "completions/mean_terminated_length": 338.09375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.28994637727737427, "epoch": 1.366421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.8466283656220367, "kl": 0.07235518842935562, "learning_rate": 6.639623137536022e-07, "loss": -0.0171, "num_tokens": 35213273.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006005764007568, "sampling/importance_sampling_ratio/min": 0.5194491147994995, "sampling/sampling_logp_difference/max": 0.8757719993591309, "sampling/sampling_logp_difference/mean": 0.014356923289597034, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 239.265625, "completions/mean_terminated_length": 239.265625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2334747612476349, "epoch": 1.3676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.1310088388813992, "kl": 0.08175957947969437, "learning_rate": 6.63289160082013e-07, "loss": -0.0167, "num_tokens": 35243978.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6492300033569336, "sampling/importance_sampling_ratio/mean": 1.0003918409347534, "sampling/importance_sampling_ratio/min": 0.4857369661331177, "sampling/sampling_logp_difference/max": 0.7220879793167114, "sampling/sampling_logp_difference/mean": 0.01374032348394394, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 204.953125, "completions/mean_terminated_length": 204.953125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3001249134540558, "epoch": 1.3688725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.08509560883550144, "kl": 0.09763385355472565, "learning_rate": 6.626156749437736e-07, "loss": 0.001, "num_tokens": 35274695.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9560965299606323, "sampling/importance_sampling_ratio/mean": 1.0007350444793701, "sampling/importance_sampling_ratio/min": 0.519588828086853, "sampling/sampling_logp_difference/max": 0.6709508895874023, "sampling/sampling_logp_difference/mean": 0.015779858455061913, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 223.546875, "completions/mean_terminated_length": 223.546875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2166321575641632, "epoch": 1.3700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.9519877488685602, "kl": 0.07133737206459045, "learning_rate": 6.619418597060159e-07, "loss": 0.0072, "num_tokens": 35306410.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9024752378463745, "sampling/importance_sampling_ratio/mean": 0.9997768998146057, "sampling/importance_sampling_ratio/min": 0.6491274833679199, "sampling/sampling_logp_difference/max": 0.6431558132171631, "sampling/sampling_logp_difference/mean": 0.011922700330615044, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 177.921875, "completions/mean_terminated_length": 177.921875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.23126693069934845, "epoch": 1.3713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.42967752946002885, "kl": 0.10150554031133652, "learning_rate": 6.612677157365425e-07, "loss": 0.001, "num_tokens": 35334837.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5501295328140259, "sampling/importance_sampling_ratio/mean": 1.000143051147461, "sampling/importance_sampling_ratio/min": 0.4596952497959137, "sampling/sampling_logp_difference/max": 0.7771915197372437, "sampling/sampling_logp_difference/mean": 0.01388323213905096, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 242.703125, "completions/mean_terminated_length": 242.703125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3640415072441101, "epoch": 1.3725490196078431, "frac_reward_zero_std": 0.25, "grad_norm": 1.7909021871087463, "kl": 0.1598997712135315, "learning_rate": 6.605932444038228e-07, "loss": -0.0379, "num_tokens": 35366578.0, "reward": 0.46875, "reward_std": 0.5431214570999146, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001885890960693, "sampling/importance_sampling_ratio/min": 0.5097939968109131, "sampling/sampling_logp_difference/max": 1.179419755935669, "sampling/sampling_logp_difference/mean": 0.01787789911031723, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 205.78125, "completions/mean_terminated_length": 205.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.31297239661216736, "epoch": 1.3737745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.06916975114202925, "kl": 0.11701078712940216, "learning_rate": 6.599184470769908e-07, "loss": 0.0012, "num_tokens": 35394180.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8037598133087158, "sampling/importance_sampling_ratio/mean": 0.9997196197509766, "sampling/importance_sampling_ratio/min": 0.535176694393158, "sampling/sampling_logp_difference/max": 0.6251583099365234, "sampling/sampling_logp_difference/mean": 0.016619134694337845, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 245.59375, "completions/mean_terminated_length": 245.59375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.24175047874450684, "epoch": 1.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.04624234806036233, "kl": 0.05887269228696823, "learning_rate": 6.592433251258422e-07, "loss": 0.0006, "num_tokens": 35432090.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.989425539970398, "sampling/importance_sampling_ratio/mean": 0.9999043941497803, "sampling/importance_sampling_ratio/min": 0.6298782229423523, "sampling/sampling_logp_difference/max": 0.6878459453582764, "sampling/sampling_logp_difference/mean": 0.012914846651256084, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 233.3125, "completions/mean_terminated_length": 233.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.23181766271591187, "epoch": 1.3762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.058806750135720655, "kl": 0.08856362104415894, "learning_rate": 6.58567879920832e-07, "loss": 0.0009, "num_tokens": 35464734.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995250105857849, "sampling/importance_sampling_ratio/min": 0.2612076997756958, "sampling/sampling_logp_difference/max": 1.3424394130706787, "sampling/sampling_logp_difference/mean": 0.014123495668172836, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 215.15625, "completions/mean_terminated_length": 215.15625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24334780871868134, "epoch": 1.3774509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 1.4454882512237588, "kl": 0.10482418537139893, "learning_rate": 6.578921128330714e-07, "loss": 0.0109, "num_tokens": 35492376.0, "reward": 0.59375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5745593309402466, "sampling/importance_sampling_ratio/mean": 0.9996858239173889, "sampling/importance_sampling_ratio/min": 0.5260617733001709, "sampling/sampling_logp_difference/max": 0.6423366069793701, "sampling/sampling_logp_difference/mean": 0.012269468046724796, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 189.109375, "completions/mean_terminated_length": 189.109375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.29685068130493164, "epoch": 1.3786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.08161177455666314, "kl": 0.12493439018726349, "learning_rate": 6.572160252343242e-07, "loss": 0.0013, "num_tokens": 35525247.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7915161848068237, "sampling/importance_sampling_ratio/mean": 0.999591588973999, "sampling/importance_sampling_ratio/min": 0.24142293632030487, "sampling/sampling_logp_difference/max": 1.4212050437927246, "sampling/sampling_logp_difference/mean": 0.018257491290569305, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 217.734375, "completions/mean_terminated_length": 217.734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3483741879463196, "epoch": 1.3799019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.4250601586537104, "kl": 0.20661494135856628, "learning_rate": 6.565396184970059e-07, "loss": -0.0062, "num_tokens": 35559326.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.60890531539917, "sampling/importance_sampling_ratio/mean": 0.9998926520347595, "sampling/importance_sampling_ratio/min": 0.5127992630004883, "sampling/sampling_logp_difference/max": 0.6678707599639893, "sampling/sampling_logp_difference/mean": 0.017867445945739746, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 321.0625, "completions/mean_terminated_length": 321.0625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.23988881707191467, "epoch": 1.3811274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.04530622749224077, "kl": 0.06789304316043854, "learning_rate": 6.558628939941791e-07, "loss": 0.0006, "num_tokens": 35602274.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7737213373184204, "sampling/importance_sampling_ratio/mean": 0.999636709690094, "sampling/importance_sampling_ratio/min": 0.5437178015708923, "sampling/sampling_logp_difference/max": 0.6093249320983887, "sampling/sampling_logp_difference/mean": 0.013073192909359932, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 248.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3969167172908783, "epoch": 1.3823529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.1137259934363668, "kl": 0.11824201047420502, "learning_rate": 6.551858530995517e-07, "loss": -0.0029, "num_tokens": 35638738.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6356092691421509, "sampling/importance_sampling_ratio/mean": 1.000152349472046, "sampling/importance_sampling_ratio/min": 0.6223164200782776, "sampling/sampling_logp_difference/max": 0.49201536178588867, "sampling/sampling_logp_difference/mean": 0.016793951392173767, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 268.203125, "completions/mean_terminated_length": 268.203125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3699800372123718, "epoch": 1.383578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.9941134239924921, "kl": 0.10607214272022247, "learning_rate": 6.545084971874736e-07, "loss": -0.0054, "num_tokens": 35676591.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5631024837493896, "sampling/importance_sampling_ratio/mean": 0.9998665452003479, "sampling/importance_sampling_ratio/min": 0.5544012188911438, "sampling/sampling_logp_difference/max": 0.5898666381835938, "sampling/sampling_logp_difference/mean": 0.018597232177853584, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 310.046875, "completions/mean_terminated_length": 310.046875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2574818730354309, "epoch": 1.3848039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.8061084669900063, "kl": 0.06990421563386917, "learning_rate": 6.538308276329349e-07, "loss": 0.0002, "num_tokens": 35716466.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6064001321792603, "sampling/importance_sampling_ratio/mean": 0.9998332262039185, "sampling/importance_sampling_ratio/min": 0.6554578542709351, "sampling/sampling_logp_difference/max": 0.4739956855773926, "sampling/sampling_logp_difference/mean": 0.01295868493616581, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 183.234375, "completions/mean_terminated_length": 183.234375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2953099012374878, "epoch": 1.3860294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.883785990579241, "kl": 0.1907866895198822, "learning_rate": 6.531528458115614e-07, "loss": -0.0226, "num_tokens": 35743777.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000083446502686, "sampling/importance_sampling_ratio/min": 0.5334725975990295, "sampling/sampling_logp_difference/max": 0.843454122543335, "sampling/sampling_logp_difference/mean": 0.01571395993232727, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 253.359375, "completions/mean_terminated_length": 253.359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2790994346141815, "epoch": 1.3872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.0671149126434625, "kl": 0.08887787163257599, "learning_rate": 6.524745530996136e-07, "loss": 0.0022, "num_tokens": 35778600.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001745223999023, "sampling/importance_sampling_ratio/min": 0.26177510619163513, "sampling/sampling_logp_difference/max": 1.3402695655822754, "sampling/sampling_logp_difference/mean": 0.01448032446205616, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 242.203125, "completions/mean_terminated_length": 242.203125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2669624090194702, "epoch": 1.3884803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.2722808135566526, "kl": 0.0965794250369072, "learning_rate": 6.517959508739825e-07, "loss": -0.0085, "num_tokens": 35812917.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7493369579315186, "sampling/importance_sampling_ratio/mean": 1.0002140998840332, "sampling/importance_sampling_ratio/min": 0.6316691637039185, "sampling/sampling_logp_difference/max": 0.5592367649078369, "sampling/sampling_logp_difference/mean": 0.013571933843195438, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 304.609375, "completions/mean_terminated_length": 304.609375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2814604341983795, "epoch": 1.3897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.4215723394233764, "kl": 0.105937659740448, "learning_rate": 6.511170405121877e-07, "loss": 0.0101, "num_tokens": 35849356.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.864237904548645, "sampling/importance_sampling_ratio/mean": 1.0003492832183838, "sampling/importance_sampling_ratio/min": 0.4562400281429291, "sampling/sampling_logp_difference/max": 0.7847362756729126, "sampling/sampling_logp_difference/mean": 0.014745076186954975, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 189.84375, "completions/mean_terminated_length": 189.84375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2530762851238251, "epoch": 1.3909313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.3020371185266197, "kl": 0.15191222727298737, "learning_rate": 6.504378233923742e-07, "loss": -0.0035, "num_tokens": 35876066.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5546691417694092, "sampling/importance_sampling_ratio/mean": 0.9999812841415405, "sampling/importance_sampling_ratio/min": 0.5129217505455017, "sampling/sampling_logp_difference/max": 0.667631983757019, "sampling/sampling_logp_difference/mean": 0.014895346947014332, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3823487162590027, "epoch": 1.392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.2954510462144206, "kl": 0.16310438513755798, "learning_rate": 6.497583008933097e-07, "loss": 0.038, "num_tokens": 35907866.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005600452423096, "sampling/importance_sampling_ratio/min": 0.3722725808620453, "sampling/sampling_logp_difference/max": 0.9881290197372437, "sampling/sampling_logp_difference/mean": 0.018685853108763695, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 255.34375, "completions/mean_terminated_length": 255.34375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.20661571621894836, "epoch": 1.3933823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.07365484221000322, "kl": 0.09507159888744354, "learning_rate": 6.490784743943818e-07, "loss": 0.0009, "num_tokens": 35939536.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.690643072128296, "sampling/importance_sampling_ratio/mean": 0.9999591112136841, "sampling/importance_sampling_ratio/min": 0.5260617733001709, "sampling/sampling_logp_difference/max": 0.6423366069793701, "sampling/sampling_logp_difference/mean": 0.012394268065690994, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 279.59375, "completions/mean_terminated_length": 279.59375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.366366446018219, "epoch": 1.3946078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.015971293057811, "kl": 0.11405825614929199, "learning_rate": 6.483983452755952e-07, "loss": 0.0197, "num_tokens": 35979478.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000872611999512, "sampling/importance_sampling_ratio/min": 0.37959447503089905, "sampling/sampling_logp_difference/max": 0.9686517715454102, "sampling/sampling_logp_difference/mean": 0.016720902174711227, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 308.359375, "completions/mean_terminated_length": 308.359375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.29545682668685913, "epoch": 1.3958333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 1.2149821217737968, "kl": 0.11053871363401413, "learning_rate": 6.477179149175692e-07, "loss": 0.0248, "num_tokens": 36021405.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.8086117506027222, "sampling/importance_sampling_ratio/mean": 1.0003973245620728, "sampling/importance_sampling_ratio/min": 0.5164161920547485, "sampling/sampling_logp_difference/max": 0.6608422994613647, "sampling/sampling_logp_difference/mean": 0.014341693371534348, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 251.453125, "completions/mean_terminated_length": 251.453125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.38097473978996277, "epoch": 1.3970588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 2.2981838917711737, "kl": 0.17838437855243683, "learning_rate": 6.470371847015341e-07, "loss": 0.0791, "num_tokens": 36059386.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994637966156006, "sampling/importance_sampling_ratio/min": 0.43644192814826965, "sampling/sampling_logp_difference/max": 0.9823462963104248, "sampling/sampling_logp_difference/mean": 0.01879509910941124, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 240.328125, "completions/mean_terminated_length": 240.328125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2859255075454712, "epoch": 1.3982843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.05652274558298719, "kl": 0.0896296501159668, "learning_rate": 6.463561560093292e-07, "loss": 0.0009, "num_tokens": 36093951.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5874453783035278, "sampling/importance_sampling_ratio/mean": 0.9997658729553223, "sampling/importance_sampling_ratio/min": 0.5317541360855103, "sampling/sampling_logp_difference/max": 0.6315741539001465, "sampling/sampling_logp_difference/mean": 0.015782658010721207, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 381.265625, "completions/mean_terminated_length": 381.265625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.31567683815956116, "epoch": 1.3995098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.1183489133905145, "kl": 0.07916092127561569, "learning_rate": 6.456748302233994e-07, "loss": 0.0091, "num_tokens": 36135840.0, "reward": 0.46875, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 0.9998644590377808, "sampling/importance_sampling_ratio/min": 0.482060045003891, "sampling/sampling_logp_difference/max": 0.7296866178512573, "sampling/sampling_logp_difference/mean": 0.01383579894900322, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.293121874332428, "epoch": 1.4007352941176472, "frac_reward_zero_std": 0.25, "grad_norm": 1.930311948823916, "kl": 0.15231949090957642, "learning_rate": 6.449932087267931e-07, "loss": -0.0106, "num_tokens": 36164016.0, "reward": -0.3125, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997024536132812, "sampling/importance_sampling_ratio/min": 0.5002604126930237, "sampling/sampling_logp_difference/max": 0.7171361446380615, "sampling/sampling_logp_difference/mean": 0.016127631068229675, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 234.4375, "completions/mean_terminated_length": 234.4375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.22176510095596313, "epoch": 1.4019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.035660544051021835, "kl": 0.06697164475917816, "learning_rate": 6.443112929031586e-07, "loss": 0.0006, "num_tokens": 36193820.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6496421098709106, "sampling/importance_sampling_ratio/mean": 1.0004054307937622, "sampling/importance_sampling_ratio/min": 0.5589194297790527, "sampling/sampling_logp_difference/max": 0.5817499160766602, "sampling/sampling_logp_difference/mean": 0.012055369094014168, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 224.78125, "completions/mean_terminated_length": 224.78125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3289373517036438, "epoch": 1.403186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.975299659007041, "kl": 0.13227379322052002, "learning_rate": 6.43629084136742e-07, "loss": -0.0033, "num_tokens": 36225614.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004262924194336, "sampling/importance_sampling_ratio/min": 0.5649482607841492, "sampling/sampling_logp_difference/max": 0.8808517456054688, "sampling/sampling_logp_difference/mean": 0.01591620221734047, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 260.71875, "completions/mean_terminated_length": 260.71875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.287993848323822, "epoch": 1.4044117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.9604651256840849, "kl": 0.1282859891653061, "learning_rate": 6.429465838123838e-07, "loss": -0.0091, "num_tokens": 36259596.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8644553422927856, "sampling/importance_sampling_ratio/mean": 1.000173807144165, "sampling/importance_sampling_ratio/min": 0.4801802635192871, "sampling/sampling_logp_difference/max": 0.7335937023162842, "sampling/sampling_logp_difference/mean": 0.015946825966238976, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 285.234375, "completions/mean_terminated_length": 285.234375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2686070203781128, "epoch": 1.405637254901961, "frac_reward_zero_std": 0.25, "grad_norm": 1.6657734507791657, "kl": 0.10684768855571747, "learning_rate": 6.422637933155162e-07, "loss": 0.0146, "num_tokens": 36296059.0, "reward": 0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7520718574523926, "sampling/importance_sampling_ratio/mean": 0.9999882578849792, "sampling/importance_sampling_ratio/min": 0.5483993291854858, "sampling/sampling_logp_difference/max": 0.600751519203186, "sampling/sampling_logp_difference/mean": 0.014088155701756477, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.19562289118766785, "epoch": 1.406862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.05258491210013407, "kl": 0.08688782900571823, "learning_rate": 6.41580714032161e-07, "loss": 0.0009, "num_tokens": 36321523.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7471122741699219, "sampling/importance_sampling_ratio/mean": 0.9997127056121826, "sampling/importance_sampling_ratio/min": 0.2428247034549713, "sampling/sampling_logp_difference/max": 1.4154155254364014, "sampling/sampling_logp_difference/mean": 0.013245074078440666, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 206.28125, "completions/mean_terminated_length": 206.28125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2681342363357544, "epoch": 1.4080882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.06443269592493465, "kl": 0.1535661518573761, "learning_rate": 6.408973473489257e-07, "loss": 0.0015, "num_tokens": 36350437.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5772086381912231, "sampling/importance_sampling_ratio/mean": 0.9998694658279419, "sampling/importance_sampling_ratio/min": 0.48595550656318665, "sampling/sampling_logp_difference/max": 0.7216382026672363, "sampling/sampling_logp_difference/mean": 0.015651695430278778, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 243.515625, "completions/mean_terminated_length": 243.515625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3394845426082611, "epoch": 1.409313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.2212874294737475, "kl": 0.10652216523885727, "learning_rate": 6.402136946530014e-07, "loss": 0.007, "num_tokens": 36386086.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8720457553863525, "sampling/importance_sampling_ratio/mean": 1.0006232261657715, "sampling/importance_sampling_ratio/min": 0.5518118143081665, "sampling/sampling_logp_difference/max": 0.6270318031311035, "sampling/sampling_logp_difference/mean": 0.018039550632238388, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 268.421875, "completions/mean_terminated_length": 268.421875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35418879985809326, "epoch": 1.4105392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 2.071230277672197, "kl": 0.1232588142156601, "learning_rate": 6.395297573321597e-07, "loss": 0.0907, "num_tokens": 36418801.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6880244016647339, "sampling/importance_sampling_ratio/mean": 0.9997165203094482, "sampling/importance_sampling_ratio/min": 0.6164071559906006, "sampling/sampling_logp_difference/max": 0.5235588550567627, "sampling/sampling_logp_difference/mean": 0.015983447432518005, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 300.796875, "completions/mean_terminated_length": 300.796875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.31910884380340576, "epoch": 1.4117647058823528, "frac_reward_zero_std": 0.25, "grad_norm": 1.7287904445096636, "kl": 0.11793461441993713, "learning_rate": 6.388455367747502e-07, "loss": 0.0234, "num_tokens": 36456868.0, "reward": 0.0625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995290040969849, "sampling/importance_sampling_ratio/min": 0.5910096764564514, "sampling/sampling_logp_difference/max": 1.1088428497314453, "sampling/sampling_logp_difference/mean": 0.014415017329156399, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 267.75, "completions/mean_terminated_length": 267.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3419966697692871, "epoch": 1.4129901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 2.0424876632302857, "kl": 0.19268451631069183, "learning_rate": 6.38161034369697e-07, "loss": -0.125, "num_tokens": 36491668.0, "reward": 0.3125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.7639833688735962, "sampling/importance_sampling_ratio/mean": 1.0002565383911133, "sampling/importance_sampling_ratio/min": 0.6652889251708984, "sampling/sampling_logp_difference/max": 0.5675745010375977, "sampling/sampling_logp_difference/mean": 0.01583666168153286, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 250.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.32501575350761414, "epoch": 1.4142156862745099, "frac_reward_zero_std": 0.5, "grad_norm": 1.8449033563929855, "kl": 0.0944858193397522, "learning_rate": 6.37476251506497e-07, "loss": 0.0228, "num_tokens": 36523748.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6479363441467285, "sampling/importance_sampling_ratio/mean": 0.9998430013656616, "sampling/importance_sampling_ratio/min": 0.4959690570831299, "sampling/sampling_logp_difference/max": 0.7012417316436768, "sampling/sampling_logp_difference/mean": 0.015257453545928001, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 229.765625, "completions/mean_terminated_length": 229.765625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.37603646516799927, "epoch": 1.4154411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.3063201336657813, "kl": 0.15218672156333923, "learning_rate": 6.367911895752158e-07, "loss": -0.0294, "num_tokens": 36559605.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5568087100982666, "sampling/importance_sampling_ratio/mean": 1.0006072521209717, "sampling/importance_sampling_ratio/min": 0.523182213306427, "sampling/sampling_logp_difference/max": 0.6478254795074463, "sampling/sampling_logp_difference/mean": 0.0173235684633255, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 238.703125, "completions/mean_terminated_length": 238.703125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3450971245765686, "epoch": 1.4166666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 1.5653244022543134, "kl": 0.11214041709899902, "learning_rate": 6.361058499664855e-07, "loss": -0.0061, "num_tokens": 36595314.0, "reward": 0.03125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7519925832748413, "sampling/importance_sampling_ratio/mean": 1.0004363059997559, "sampling/importance_sampling_ratio/min": 0.44848787784576416, "sampling/sampling_logp_difference/max": 0.8018736839294434, "sampling/sampling_logp_difference/mean": 0.015167295932769775, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 204.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2972475290298462, "epoch": 1.4178921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.06121943003613818, "kl": 0.12594039738178253, "learning_rate": 6.354202340715026e-07, "loss": 0.0012, "num_tokens": 36627306.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5914101600646973, "sampling/importance_sampling_ratio/mean": 0.9999909400939941, "sampling/importance_sampling_ratio/min": 0.4500995874404907, "sampling/sampling_logp_difference/max": 0.7982864379882812, "sampling/sampling_logp_difference/mean": 0.015716087073087692, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 263.578125, "completions/mean_terminated_length": 263.578125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3475438952445984, "epoch": 1.4191176470588236, "frac_reward_zero_std": 0.25, "grad_norm": 1.6374156335189483, "kl": 0.13400861620903015, "learning_rate": 6.347343432820234e-07, "loss": 0.0278, "num_tokens": 36664527.0, "reward": 0.4375, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.586793303489685, "sampling/importance_sampling_ratio/mean": 0.9999744892120361, "sampling/importance_sampling_ratio/min": 0.6635606288909912, "sampling/sampling_logp_difference/max": 0.4617152214050293, "sampling/sampling_logp_difference/mean": 0.015900876373052597, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 184.1875, "completions/mean_terminated_length": 184.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.28351420164108276, "epoch": 1.420343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.07877137010457892, "kl": 0.11289854347705841, "learning_rate": 6.340481789903634e-07, "loss": 0.0011, "num_tokens": 36700043.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.738796353340149, "sampling/importance_sampling_ratio/mean": 1.0004609823226929, "sampling/importance_sampling_ratio/min": 0.5671127438545227, "sampling/sampling_logp_difference/max": 0.5671970844268799, "sampling/sampling_logp_difference/mean": 0.01656191609799862, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 266.890625, "completions/mean_terminated_length": 266.890625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.37652432918548584, "epoch": 1.4215686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.14840458178793, "kl": 0.12098322808742523, "learning_rate": 6.333617425893919e-07, "loss": -0.038, "num_tokens": 36733124.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5277866125106812, "sampling/importance_sampling_ratio/mean": 1.0003337860107422, "sampling/importance_sampling_ratio/min": 0.6173402667045593, "sampling/sampling_logp_difference/max": 0.48233485221862793, "sampling/sampling_logp_difference/mean": 0.016561204567551613, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 222.703125, "completions/mean_terminated_length": 222.703125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2620255649089813, "epoch": 1.4227941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.1591485140148, "kl": 0.09756740927696228, "learning_rate": 6.326750354725319e-07, "loss": -0.0047, "num_tokens": 36765137.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.821160078048706, "sampling/importance_sampling_ratio/mean": 1.0002214908599854, "sampling/importance_sampling_ratio/min": 0.5200066566467285, "sampling/sampling_logp_difference/max": 0.6539137363433838, "sampling/sampling_logp_difference/mean": 0.014374290592968464, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 223.015625, "completions/mean_terminated_length": 223.015625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.36535540223121643, "epoch": 1.4240196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.7028083486710426, "kl": 0.13686351478099823, "learning_rate": 6.319880590337548e-07, "loss": 0.0281, "num_tokens": 36798162.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.47568678855896, "sampling/importance_sampling_ratio/mean": 1.0003669261932373, "sampling/importance_sampling_ratio/min": 0.5781504511833191, "sampling/sampling_logp_difference/max": 0.5479211807250977, "sampling/sampling_logp_difference/mean": 0.0174210574477911, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 169.453125, "completions/mean_terminated_length": 169.453125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.25308871269226074, "epoch": 1.4252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.15882293046274884, "kl": 0.15826408565044403, "learning_rate": 6.313008146675799e-07, "loss": 0.0016, "num_tokens": 36828239.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6306155920028687, "sampling/importance_sampling_ratio/mean": 1.0002827644348145, "sampling/importance_sampling_ratio/min": 0.5011641383171082, "sampling/sampling_logp_difference/max": 0.690821647644043, "sampling/sampling_logp_difference/mean": 0.01709289662539959, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 229.296875, "completions/mean_terminated_length": 229.296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.28702783584594727, "epoch": 1.4264705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.211984767247007, "kl": 0.08903306722640991, "learning_rate": 6.306133037690692e-07, "loss": -0.0355, "num_tokens": 36860994.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6364332437515259, "sampling/importance_sampling_ratio/mean": 0.9997914433479309, "sampling/importance_sampling_ratio/min": 0.5128509998321533, "sampling/sampling_logp_difference/max": 0.6677699089050293, "sampling/sampling_logp_difference/mean": 0.014962265267968178, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 205.46875, "completions/mean_terminated_length": 205.46875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.18544542789459229, "epoch": 1.4276960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.3131834377855973, "kl": 0.0759623795747757, "learning_rate": 6.299255277338264e-07, "loss": 0.0247, "num_tokens": 36893152.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6589863300323486, "sampling/importance_sampling_ratio/mean": 0.9993994235992432, "sampling/importance_sampling_ratio/min": 0.548751711845398, "sampling/sampling_logp_difference/max": 0.6001091003417969, "sampling/sampling_logp_difference/mean": 0.011164880357682705, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 217.21875, "completions/mean_terminated_length": 217.21875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.27909189462661743, "epoch": 1.428921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.4916533217815588, "kl": 0.09997544437646866, "learning_rate": 6.292374879579934e-07, "loss": 0.0004, "num_tokens": 36921262.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6306602954864502, "sampling/importance_sampling_ratio/mean": 0.9999811053276062, "sampling/importance_sampling_ratio/min": 0.5853133797645569, "sampling/sampling_logp_difference/max": 0.5356078147888184, "sampling/sampling_logp_difference/mean": 0.013811073265969753, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 229.21875, "completions/mean_terminated_length": 229.21875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3222309947013855, "epoch": 1.4301470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.6905276313493822, "kl": 0.1145516037940979, "learning_rate": 6.285491858382473e-07, "loss": -0.0476, "num_tokens": 36955468.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7082451581954956, "sampling/importance_sampling_ratio/mean": 0.9996176958084106, "sampling/importance_sampling_ratio/min": 0.5860161781311035, "sampling/sampling_logp_difference/max": 0.5354666709899902, "sampling/sampling_logp_difference/mean": 0.014726577326655388, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 242.796875, "completions/mean_terminated_length": 242.796875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3567851185798645, "epoch": 1.4313725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.0993007660511787, "kl": 0.11437512934207916, "learning_rate": 6.278606227717978e-07, "loss": 0.0381, "num_tokens": 36993039.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.602103590965271, "sampling/importance_sampling_ratio/mean": 0.9991492033004761, "sampling/importance_sampling_ratio/min": 0.6059837341308594, "sampling/sampling_logp_difference/max": 0.5009021759033203, "sampling/sampling_logp_difference/mean": 0.017158126458525658, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.340457022190094, "epoch": 1.4325980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 2.2564928648237985, "kl": 0.1473807990550995, "learning_rate": 6.271718001563843e-07, "loss": -0.0095, "num_tokens": 37024791.0, "reward": 0.375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998472929000854, "sampling/importance_sampling_ratio/min": 0.48082515597343445, "sampling/sampling_logp_difference/max": 1.5006871223449707, "sampling/sampling_logp_difference/mean": 0.01723838783800602, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 264.65625, "completions/mean_terminated_length": 264.65625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.35243093967437744, "epoch": 1.4338235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 2.1063927522456454, "kl": 0.1826269030570984, "learning_rate": 6.264827193902731e-07, "loss": 0.0448, "num_tokens": 37063825.0, "reward": 0.65625, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5744361877441406, "sampling/importance_sampling_ratio/mean": 0.999392032623291, "sampling/importance_sampling_ratio/min": 0.5328908562660217, "sampling/sampling_logp_difference/max": 0.6294386386871338, "sampling/sampling_logp_difference/mean": 0.016563165932893753, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 221.6875, "completions/mean_terminated_length": 221.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3303709626197815, "epoch": 1.4350490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.224968075553671, "kl": 0.1414005309343338, "learning_rate": 6.257933818722542e-07, "loss": -0.0035, "num_tokens": 37098269.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6598443984985352, "sampling/importance_sampling_ratio/mean": 1.0001469850540161, "sampling/importance_sampling_ratio/min": 0.6217727065086365, "sampling/sampling_logp_difference/max": 0.5067238807678223, "sampling/sampling_logp_difference/mean": 0.01730150170624256, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 212.984375, "completions/mean_terminated_length": 212.984375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3266046941280365, "epoch": 1.4362745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.7800707386255215, "kl": 0.11052196472883224, "learning_rate": 6.251037890016395e-07, "loss": -0.0443, "num_tokens": 37127308.0, "reward": 0.28125, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.988501787185669, "sampling/importance_sampling_ratio/mean": 0.99956214427948, "sampling/importance_sampling_ratio/min": 0.46266159415245056, "sampling/sampling_logp_difference/max": 0.7707594037055969, "sampling/sampling_logp_difference/mean": 0.016007719561457634, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 207.984375, "completions/mean_terminated_length": 207.984375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3158145248889923, "epoch": 1.4375, "frac_reward_zero_std": 0.75, "grad_norm": 1.2159819180519693, "kl": 0.14949479699134827, "learning_rate": 6.244139421782587e-07, "loss": 0.0189, "num_tokens": 37154363.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6023591756820679, "sampling/importance_sampling_ratio/mean": 0.9998407959938049, "sampling/importance_sampling_ratio/min": 0.6270001530647278, "sampling/sampling_logp_difference/max": 0.47147703170776367, "sampling/sampling_logp_difference/mean": 0.014544149860739708, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 247.484375, "completions/mean_terminated_length": 247.484375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2983068823814392, "epoch": 1.4387254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.337779453550791, "kl": 0.0902748703956604, "learning_rate": 6.237238428024571e-07, "loss": -0.0021, "num_tokens": 37189466.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.663038730621338, "sampling/importance_sampling_ratio/mean": 1.0002652406692505, "sampling/importance_sampling_ratio/min": 0.6163129210472107, "sampling/sampling_logp_difference/max": 0.5086464881896973, "sampling/sampling_logp_difference/mean": 0.015390568412840366, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 180.3125, "completions/mean_terminated_length": 180.3125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.25251686573028564, "epoch": 1.4399509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 2.211544059427017, "kl": 0.15830758213996887, "learning_rate": 6.230334922750929e-07, "loss": -0.127, "num_tokens": 37215150.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6885360479354858, "sampling/importance_sampling_ratio/mean": 0.99983811378479, "sampling/importance_sampling_ratio/min": 0.46046993136405945, "sampling/sampling_logp_difference/max": 0.7755076885223389, "sampling/sampling_logp_difference/mean": 0.013846715912222862, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 201.40625, "completions/mean_terminated_length": 201.40625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2913873493671417, "epoch": 1.4411764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.4796317158140682, "kl": 0.13480722904205322, "learning_rate": 6.223428919975338e-07, "loss": 0.0152, "num_tokens": 37246936.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5692412853240967, "sampling/importance_sampling_ratio/mean": 1.000030755996704, "sampling/importance_sampling_ratio/min": 0.36347436904907227, "sampling/sampling_logp_difference/max": 1.0120465755462646, "sampling/sampling_logp_difference/mean": 0.014956055209040642, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 230.578125, "completions/mean_terminated_length": 230.578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.35280194878578186, "epoch": 1.4424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.2057679055290507, "kl": 0.13378706574440002, "learning_rate": 6.216520433716544e-07, "loss": 0.0099, "num_tokens": 37279405.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4461430311203003, "sampling/importance_sampling_ratio/mean": 0.9984460473060608, "sampling/importance_sampling_ratio/min": 0.4309263527393341, "sampling/sampling_logp_difference/max": 0.84181809425354, "sampling/sampling_logp_difference/mean": 0.017107432708144188, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 193.15625, "completions/mean_terminated_length": 193.15625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3700255751609802, "epoch": 1.4436274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.2374435367190664, "kl": 0.12329597026109695, "learning_rate": 6.209609477998338e-07, "loss": 0.0842, "num_tokens": 37312615.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992519617080688, "sampling/importance_sampling_ratio/min": 0.6034554243087769, "sampling/sampling_logp_difference/max": 0.7990565299987793, "sampling/sampling_logp_difference/mean": 0.018614256754517555, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 159.984375, "completions/mean_terminated_length": 159.984375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.33156856894493103, "epoch": 1.4448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.6806987115221546, "kl": 0.13869813084602356, "learning_rate": 6.202696066849524e-07, "loss": -0.008, "num_tokens": 37337350.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.93405282497406, "sampling/importance_sampling_ratio/mean": 0.9998618364334106, "sampling/importance_sampling_ratio/min": 0.4030509293079376, "sampling/sampling_logp_difference/max": 0.9086923599243164, "sampling/sampling_logp_difference/mean": 0.017438728362321854, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 173.609375, "completions/mean_terminated_length": 173.609375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3063565492630005, "epoch": 1.446078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.7073736999319093, "kl": 0.11664046347141266, "learning_rate": 6.195780214303887e-07, "loss": -0.0128, "num_tokens": 37371341.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8195306062698364, "sampling/importance_sampling_ratio/mean": 1.0002837181091309, "sampling/importance_sampling_ratio/min": 0.618789553642273, "sampling/sampling_logp_difference/max": 0.5985785722732544, "sampling/sampling_logp_difference/mean": 0.01647639088332653, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 233.90625, "completions/mean_terminated_length": 233.90625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.27668413519859314, "epoch": 1.4473039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.0096771439449546, "kl": 0.11705988645553589, "learning_rate": 6.188861934400171e-07, "loss": -0.0418, "num_tokens": 37409799.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7785027027130127, "sampling/importance_sampling_ratio/mean": 1.0001142024993896, "sampling/importance_sampling_ratio/min": 0.4169924557209015, "sampling/sampling_logp_difference/max": 0.8746871948242188, "sampling/sampling_logp_difference/mean": 0.014162404462695122, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 218.671875, "completions/mean_terminated_length": 218.671875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.30626606941223145, "epoch": 1.4485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.05596439045356626, "kl": 0.09142415225505829, "learning_rate": 6.181941241182043e-07, "loss": 0.0009, "num_tokens": 37446546.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4812122583389282, "sampling/importance_sampling_ratio/mean": 0.9997881054878235, "sampling/importance_sampling_ratio/min": 0.47619307041168213, "sampling/sampling_logp_difference/max": 0.7419319152832031, "sampling/sampling_logp_difference/mean": 0.01617203839123249, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 155.828125, "completions/mean_terminated_length": 155.828125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2763192057609558, "epoch": 1.4497549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 2.5668047830796996, "kl": 0.17243480682373047, "learning_rate": 6.175018148698076e-07, "loss": -0.0257, "num_tokens": 37474951.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.702722191810608, "sampling/importance_sampling_ratio/mean": 1.0007953643798828, "sampling/importance_sampling_ratio/min": 0.5423819422721863, "sampling/sampling_logp_difference/max": 0.611784815788269, "sampling/sampling_logp_difference/mean": 0.01690756157040596, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 211.21875, "completions/mean_terminated_length": 211.21875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.32851842045783997, "epoch": 1.4509803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.2834727068697949, "kl": 0.17326781153678894, "learning_rate": 6.168092671001705e-07, "loss": 0.0135, "num_tokens": 37509093.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8608806133270264, "sampling/importance_sampling_ratio/mean": 1.0005269050598145, "sampling/importance_sampling_ratio/min": 0.5097377300262451, "sampling/sampling_logp_difference/max": 0.6738590002059937, "sampling/sampling_logp_difference/mean": 0.017571967095136642, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 186.953125, "completions/mean_terminated_length": 186.953125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.28477370738983154, "epoch": 1.4522058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.04943827233911848, "kl": 0.10224197059869766, "learning_rate": 6.161164822151213e-07, "loss": 0.001, "num_tokens": 37539602.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5690999031066895, "sampling/importance_sampling_ratio/mean": 0.9998160600662231, "sampling/importance_sampling_ratio/min": 0.45969608426094055, "sampling/sampling_logp_difference/max": 0.7771897315979004, "sampling/sampling_logp_difference/mean": 0.015761321410536766, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 223.84375, "completions/mean_terminated_length": 223.84375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2302074134349823, "epoch": 1.4534313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.015837087750094, "kl": 0.08969669044017792, "learning_rate": 6.154234616209692e-07, "loss": -0.0176, "num_tokens": 37573496.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6544609069824219, "sampling/importance_sampling_ratio/mean": 1.0001399517059326, "sampling/importance_sampling_ratio/min": 0.5637956857681274, "sampling/sampling_logp_difference/max": 0.5730633735656738, "sampling/sampling_logp_difference/mean": 0.012367047369480133, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 220.28125, "completions/mean_terminated_length": 220.28125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3840216398239136, "epoch": 1.454656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.456764309807822, "kl": 0.11951378732919693, "learning_rate": 6.147302067245028e-07, "loss": 0.0077, "num_tokens": 37603866.0, "reward": -0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5749192237854004, "sampling/importance_sampling_ratio/mean": 0.999834418296814, "sampling/importance_sampling_ratio/min": 0.059670716524124146, "sampling/sampling_logp_difference/max": 2.8189139366149902, "sampling/sampling_logp_difference/mean": 0.01715525984764099, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 197.015625, "completions/mean_terminated_length": 197.015625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3232305645942688, "epoch": 1.4558823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 1.3193424143390062, "kl": 0.10033085197210312, "learning_rate": 6.140367189329847e-07, "loss": -0.0531, "num_tokens": 37634203.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7066165208816528, "sampling/importance_sampling_ratio/mean": 1.000067949295044, "sampling/importance_sampling_ratio/min": 0.47672104835510254, "sampling/sampling_logp_difference/max": 0.7408237457275391, "sampling/sampling_logp_difference/mean": 0.01664803922176361, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 184.015625, "completions/mean_terminated_length": 184.015625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3196144700050354, "epoch": 1.4571078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.11496818100434487, "kl": 0.12379920482635498, "learning_rate": 6.133429996541518e-07, "loss": 0.0011, "num_tokens": 37664444.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000017523765564, "sampling/importance_sampling_ratio/min": 0.11415062844753265, "sampling/sampling_logp_difference/max": 2.170236349105835, "sampling/sampling_logp_difference/mean": 0.01712869480252266, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 219.15625, "completions/mean_terminated_length": 219.15625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2887090742588043, "epoch": 1.4583333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 1.1348927038126526, "kl": 0.11597549915313721, "learning_rate": 6.1264905029621e-07, "loss": 0.0097, "num_tokens": 37700438.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6932430267333984, "sampling/importance_sampling_ratio/mean": 1.0000178813934326, "sampling/importance_sampling_ratio/min": 0.5735724568367004, "sampling/sampling_logp_difference/max": 0.5558710694313049, "sampling/sampling_logp_difference/mean": 0.015668664127588272, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 181.390625, "completions/mean_terminated_length": 181.390625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.28322315216064453, "epoch": 1.4595588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.06817020188729356, "kl": 0.14192894101142883, "learning_rate": 6.119548722678327e-07, "loss": 0.0014, "num_tokens": 37733519.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99979567527771, "sampling/importance_sampling_ratio/min": 0.6217517852783203, "sampling/sampling_logp_difference/max": 0.7287606000900269, "sampling/sampling_logp_difference/mean": 0.016558753326535225, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.28417468070983887, "epoch": 1.4607843137254901, "frac_reward_zero_std": 0.75, "grad_norm": 1.2412866245053409, "kl": 0.10667260736227036, "learning_rate": 6.112604669781572e-07, "loss": 0.0356, "num_tokens": 37763191.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6555081605911255, "sampling/importance_sampling_ratio/mean": 1.0000934600830078, "sampling/importance_sampling_ratio/min": 0.22631296515464783, "sampling/sampling_logp_difference/max": 1.4858365058898926, "sampling/sampling_logp_difference/mean": 0.014470456168055534, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 240.765625, "completions/mean_terminated_length": 240.765625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3636624813079834, "epoch": 1.4620098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.578333476370098, "kl": 0.11750581115484238, "learning_rate": 6.105658358367822e-07, "loss": 0.038, "num_tokens": 37797688.0, "reward": 0.625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5071572065353394, "sampling/importance_sampling_ratio/mean": 1.000314474105835, "sampling/importance_sampling_ratio/min": 0.6122506260871887, "sampling/sampling_logp_difference/max": 0.4906136393547058, "sampling/sampling_logp_difference/mean": 0.01553491409868002, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.219697505235672, "epoch": 1.4632352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.044637562856320645, "kl": 0.08123414218425751, "learning_rate": 6.098709802537653e-07, "loss": 0.0009, "num_tokens": 37821888.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9099587202072144, "sampling/importance_sampling_ratio/mean": 1.0004470348358154, "sampling/importance_sampling_ratio/min": 0.48237401247024536, "sampling/sampling_logp_difference/max": 0.729035496711731, "sampling/sampling_logp_difference/mean": 0.01361137256026268, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 207.03125, "completions/mean_terminated_length": 207.03125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2628161609172821, "epoch": 1.4644607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.4571893795286242, "kl": 0.08402369171380997, "learning_rate": 6.091759016396188e-07, "loss": -0.0356, "num_tokens": 37852498.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5961744785308838, "sampling/importance_sampling_ratio/mean": 1.0001986026763916, "sampling/importance_sampling_ratio/min": 0.49821826815605164, "sampling/sampling_logp_difference/max": 0.6967170238494873, "sampling/sampling_logp_difference/mean": 0.014230694621801376, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2504669427871704, "epoch": 1.465686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.307576053585658, "kl": 0.12320666015148163, "learning_rate": 6.084806014053086e-07, "loss": 0.0013, "num_tokens": 37878766.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.530218482017517, "sampling/importance_sampling_ratio/mean": 0.9995049238204956, "sampling/importance_sampling_ratio/min": 0.5890116095542908, "sampling/sampling_logp_difference/max": 0.5293093323707581, "sampling/sampling_logp_difference/mean": 0.013893891125917435, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 173.84375, "completions/mean_terminated_length": 173.84375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.1905520260334015, "epoch": 1.4669117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.2072480243072812, "kl": 0.07715676724910736, "learning_rate": 6.077850809622498e-07, "loss": 0.038, "num_tokens": 37907428.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9979921579360962, "sampling/importance_sampling_ratio/mean": 1.0002038478851318, "sampling/importance_sampling_ratio/min": 0.6058018207550049, "sampling/sampling_logp_difference/max": 0.6921427249908447, "sampling/sampling_logp_difference/mean": 0.010474804788827896, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 183.953125, "completions/mean_terminated_length": 183.953125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.32976096868515015, "epoch": 1.468137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.6320427464177791, "kl": 0.19351902604103088, "learning_rate": 6.070893417223052e-07, "loss": 0.0134, "num_tokens": 37933073.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6176202297210693, "sampling/importance_sampling_ratio/mean": 0.9999404549598694, "sampling/importance_sampling_ratio/min": 0.5917201042175293, "sampling/sampling_logp_difference/max": 0.5247215628623962, "sampling/sampling_logp_difference/mean": 0.016002189368009567, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 181.5625, "completions/mean_terminated_length": 181.5625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.31580162048339844, "epoch": 1.469362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.3271427980708945, "kl": 0.22780528664588928, "learning_rate": 6.06393385097781e-07, "loss": -0.0021, "num_tokens": 37964853.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004262924194336, "sampling/importance_sampling_ratio/min": 0.5923798084259033, "sampling/sampling_logp_difference/max": 1.3286752700805664, "sampling/sampling_logp_difference/mean": 0.014728747308254242, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 193.046875, "completions/mean_terminated_length": 193.046875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.26126304268836975, "epoch": 1.4705882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.753164343747916, "kl": 0.15420006215572357, "learning_rate": 6.056972125014254e-07, "loss": -0.0152, "num_tokens": 37994568.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4986714124679565, "sampling/importance_sampling_ratio/mean": 0.9999833106994629, "sampling/importance_sampling_ratio/min": 0.47390836477279663, "sampling/sampling_logp_difference/max": 0.7467412948608398, "sampling/sampling_logp_difference/mean": 0.012895776890218258, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 227.796875, "completions/mean_terminated_length": 227.796875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.32024574279785156, "epoch": 1.471813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.17119847616517, "kl": 0.11787056922912598, "learning_rate": 6.050008253464246e-07, "loss": 0.0141, "num_tokens": 38027835.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5791263580322266, "sampling/importance_sampling_ratio/mean": 0.9998366832733154, "sampling/importance_sampling_ratio/min": 0.5746538043022156, "sampling/sampling_logp_difference/max": 0.5539875030517578, "sampling/sampling_logp_difference/mean": 0.015224370174109936, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 187.53125, "completions/mean_terminated_length": 187.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.29485833644866943, "epoch": 1.4730392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.05094943383238978, "kl": 0.10031016170978546, "learning_rate": 6.043042250464004e-07, "loss": 0.001, "num_tokens": 38062349.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7593786716461182, "sampling/importance_sampling_ratio/mean": 1.0004372596740723, "sampling/importance_sampling_ratio/min": 0.5719299912452698, "sampling/sampling_logp_difference/max": 0.5649607181549072, "sampling/sampling_logp_difference/mean": 0.017132576555013657, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 197.03125, "completions/mean_terminated_length": 197.03125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.30228111147880554, "epoch": 1.4742647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 1.3274662296376223, "kl": 0.16679570078849792, "learning_rate": 6.036074130154071e-07, "loss": -0.0177, "num_tokens": 38091711.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.624650478363037, "sampling/importance_sampling_ratio/mean": 1.0004812479019165, "sampling/importance_sampling_ratio/min": 0.5937528014183044, "sampling/sampling_logp_difference/max": 0.5212922096252441, "sampling/sampling_logp_difference/mean": 0.015419438481330872, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 220.03125, "completions/mean_terminated_length": 220.03125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27057573199272156, "epoch": 1.4754901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8076838544415822, "kl": 0.07092368602752686, "learning_rate": 6.029103906679293e-07, "loss": -0.0013, "num_tokens": 38122433.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6192903518676758, "sampling/importance_sampling_ratio/mean": 0.9996510148048401, "sampling/importance_sampling_ratio/min": 0.5873081684112549, "sampling/sampling_logp_difference/max": 0.5322055816650391, "sampling/sampling_logp_difference/mean": 0.013554556295275688, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.23620840907096863, "epoch": 1.4767156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.04665933164818391, "kl": 0.08392497897148132, "learning_rate": 6.022131594188777e-07, "loss": 0.0008, "num_tokens": 38158353.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6152162551879883, "sampling/importance_sampling_ratio/mean": 0.9992532730102539, "sampling/importance_sampling_ratio/min": 0.5516251921653748, "sampling/sampling_logp_difference/max": 0.5948865413665771, "sampling/sampling_logp_difference/mean": 0.013747544959187508, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 207.859375, "completions/mean_terminated_length": 207.859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2698655128479004, "epoch": 1.4779411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.07123002636763835, "kl": 0.08243486285209656, "learning_rate": 6.01515720683588e-07, "loss": 0.0009, "num_tokens": 38185272.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6216869354248047, "sampling/importance_sampling_ratio/mean": 0.9996257424354553, "sampling/importance_sampling_ratio/min": 0.591215968132019, "sampling/sampling_logp_difference/max": 0.5255739688873291, "sampling/sampling_logp_difference/mean": 0.01550292782485485, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 212.953125, "completions/mean_terminated_length": 212.953125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.29221758246421814, "epoch": 1.4791666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 1.1799150266306437, "kl": 0.11821890622377396, "learning_rate": 6.008180758778166e-07, "loss": 0.0338, "num_tokens": 38216533.0, "reward": -0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998124837875366, "sampling/importance_sampling_ratio/min": 0.3483121991157532, "sampling/sampling_logp_difference/max": 1.0546560287475586, "sampling/sampling_logp_difference/mean": 0.017921222373843193, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 238.28125, "completions/mean_terminated_length": 238.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3860968351364136, "epoch": 1.4803921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.323393463562757, "kl": 0.14008009433746338, "learning_rate": 6.001202264177382e-07, "loss": 0.0007, "num_tokens": 38250951.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000505805015564, "sampling/importance_sampling_ratio/min": 0.34883177280426025, "sampling/sampling_logp_difference/max": 1.0531654357910156, "sampling/sampling_logp_difference/mean": 0.018849536776542664, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 202.71875, "completions/mean_terminated_length": 202.71875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2686987519264221, "epoch": 1.4816176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.04456395390141526, "kl": 0.08406025171279907, "learning_rate": 5.99422173719943e-07, "loss": 0.0008, "num_tokens": 38280661.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999376535415649, "sampling/importance_sampling_ratio/min": 0.5367775559425354, "sampling/sampling_logp_difference/max": 0.8054203987121582, "sampling/sampling_logp_difference/mean": 0.014889596030116081, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 192.96875, "completions/mean_terminated_length": 192.96875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3588130474090576, "epoch": 1.482843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.06278643958629222, "kl": 0.12569394707679749, "learning_rate": 5.987239192014335e-07, "loss": 0.0013, "num_tokens": 38314995.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6949001550674438, "sampling/importance_sampling_ratio/mean": 0.9996659755706787, "sampling/importance_sampling_ratio/min": 0.4953877627849579, "sampling/sampling_logp_difference/max": 0.7024144530296326, "sampling/sampling_logp_difference/mean": 0.019107583910226822, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 189.609375, "completions/mean_terminated_length": 189.609375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3276250958442688, "epoch": 1.4840686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.3275975574656989, "kl": 0.16689805686473846, "learning_rate": 5.980254642796226e-07, "loss": -0.0044, "num_tokens": 38344090.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5865861177444458, "sampling/importance_sampling_ratio/mean": 0.9994770288467407, "sampling/importance_sampling_ratio/min": 0.3266303837299347, "sampling/sampling_logp_difference/max": 1.1189260482788086, "sampling/sampling_logp_difference/mean": 0.015126791782677174, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 152.296875, "completions/mean_terminated_length": 152.296875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.28168967366218567, "epoch": 1.4852941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.3328207477552263, "kl": 0.13398806750774384, "learning_rate": 5.973268103723293e-07, "loss": 0.0074, "num_tokens": 38368861.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8144325017929077, "sampling/importance_sampling_ratio/mean": 1.0001928806304932, "sampling/importance_sampling_ratio/min": 0.6089951395988464, "sampling/sampling_logp_difference/max": 0.5957727432250977, "sampling/sampling_logp_difference/mean": 0.01566866785287857, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 325.59375, "completions/mean_terminated_length": 325.59375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3338436782360077, "epoch": 1.4865196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.03745037643506664, "kl": 0.06090451776981354, "learning_rate": 5.966279588977766e-07, "loss": 0.0006, "num_tokens": 38410163.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7999895811080933, "sampling/importance_sampling_ratio/mean": 0.9999022483825684, "sampling/importance_sampling_ratio/min": 0.5719600319862366, "sampling/sampling_logp_difference/max": 0.5877808928489685, "sampling/sampling_logp_difference/mean": 0.014957626350224018, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 246.15625, "completions/mean_terminated_length": 246.15625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.27992749214172363, "epoch": 1.4877450980392157, "frac_reward_zero_std": 0.25, "grad_norm": 1.872454806566852, "kl": 0.09845226258039474, "learning_rate": 5.959289112745891e-07, "loss": 0.0565, "num_tokens": 38444045.0, "reward": 0.71875, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999333620071411, "sampling/importance_sampling_ratio/min": 0.35846590995788574, "sampling/sampling_logp_difference/max": 1.0259218215942383, "sampling/sampling_logp_difference/mean": 0.013677786104381084, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 239.796875, "completions/mean_terminated_length": 239.796875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.26544874906539917, "epoch": 1.4889705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.7343358666089387, "kl": 0.136094868183136, "learning_rate": 5.952296689217889e-07, "loss": -0.0005, "num_tokens": 38475248.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7804735898971558, "sampling/importance_sampling_ratio/mean": 1.0001195669174194, "sampling/importance_sampling_ratio/min": 0.42388060688972473, "sampling/sampling_logp_difference/max": 0.858303427696228, "sampling/sampling_logp_difference/mean": 0.014023521915078163, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 147.109375, "completions/mean_terminated_length": 147.109375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.17752957344055176, "epoch": 1.4901960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.05915417617703328, "kl": 0.08745570480823517, "learning_rate": 5.945302332587938e-07, "loss": 0.0009, "num_tokens": 38501255.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002703666687012, "sampling/importance_sampling_ratio/min": 0.5038222074508667, "sampling/sampling_logp_difference/max": 0.8376307487487793, "sampling/sampling_logp_difference/mean": 0.012213384732604027, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2942553460597992, "epoch": 1.491421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.3111210153575885, "kl": 0.1029469296336174, "learning_rate": 5.938306057054138e-07, "loss": -0.0338, "num_tokens": 38531711.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000220775604248, "sampling/importance_sampling_ratio/min": 0.6058831810951233, "sampling/sampling_logp_difference/max": 0.8302359580993652, "sampling/sampling_logp_difference/mean": 0.013989787548780441, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.20770685374736786, "epoch": 1.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02846568663031406, "kl": 0.059119801968336105, "learning_rate": 5.931307876818487e-07, "loss": 0.0005, "num_tokens": 38567207.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5212385654449463, "sampling/importance_sampling_ratio/mean": 1.000232458114624, "sampling/importance_sampling_ratio/min": 0.6117788553237915, "sampling/sampling_logp_difference/max": 0.49138450622558594, "sampling/sampling_logp_difference/mean": 0.01106779370456934, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 206.28125, "completions/mean_terminated_length": 206.28125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.29796284437179565, "epoch": 1.4938725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.648576252755327, "kl": 0.20413929224014282, "learning_rate": 5.924307806086843e-07, "loss": 0.015, "num_tokens": 38596681.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6358323097229004, "sampling/importance_sampling_ratio/mean": 1.000525951385498, "sampling/importance_sampling_ratio/min": 0.5990678668022156, "sampling/sampling_logp_difference/max": 0.5123803615570068, "sampling/sampling_logp_difference/mean": 0.015031736344099045, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.20849837362766266, "epoch": 1.4950980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.050280758429565336, "kl": 0.08213002979755402, "learning_rate": 5.917305859068911e-07, "loss": 0.0008, "num_tokens": 38626109.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5541421175003052, "sampling/importance_sampling_ratio/mean": 1.0002679824829102, "sampling/importance_sampling_ratio/min": 0.5370172262191772, "sampling/sampling_logp_difference/max": 0.6217250823974609, "sampling/sampling_logp_difference/mean": 0.011642636731266975, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 238.890625, "completions/mean_terminated_length": 238.890625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.22361789643764496, "epoch": 1.4963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04288308873635731, "kl": 0.08960422873497009, "learning_rate": 5.910302049978199e-07, "loss": 0.0009, "num_tokens": 38658694.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5146970748901367, "sampling/importance_sampling_ratio/mean": 0.9996353387832642, "sampling/importance_sampling_ratio/min": 0.6171490550041199, "sampling/sampling_logp_difference/max": 0.4826446771621704, "sampling/sampling_logp_difference/mean": 0.012359712272882462, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 196.140625, "completions/mean_terminated_length": 196.140625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.33020782470703125, "epoch": 1.4975490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.7846041729866438, "kl": 0.1470070332288742, "learning_rate": 5.903296393031995e-07, "loss": 0.0156, "num_tokens": 38689423.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9609761238098145, "sampling/importance_sampling_ratio/mean": 0.9995192885398865, "sampling/importance_sampling_ratio/min": 0.16146506369113922, "sampling/sampling_logp_difference/max": 1.8234665393829346, "sampling/sampling_logp_difference/mean": 0.017301514744758606, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 256.671875, "completions/mean_terminated_length": 256.671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.25761643052101135, "epoch": 1.4987745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.03820689698028908, "kl": 0.06466948240995407, "learning_rate": 5.896288902451338e-07, "loss": 0.0006, "num_tokens": 38728570.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.818019151687622, "sampling/importance_sampling_ratio/mean": 0.9998496770858765, "sampling/importance_sampling_ratio/min": 0.4847292900085449, "sampling/sampling_logp_difference/max": 0.7241647243499756, "sampling/sampling_logp_difference/mean": 0.013553157448768616, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 190.65625, "completions/mean_terminated_length": 190.65625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.19422778487205505, "epoch": 1.5, "frac_reward_zero_std": 0.75, "grad_norm": 1.1709296490721015, "kl": 0.07536928355693817, "learning_rate": 5.88927959246099e-07, "loss": -0.0322, "num_tokens": 38756804.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.8888996839523315, "sampling/importance_sampling_ratio/mean": 1.00026273727417, "sampling/importance_sampling_ratio/min": 0.6327093243598938, "sampling/sampling_logp_difference/max": 0.6359944343566895, "sampling/sampling_logp_difference/mean": 0.009969940409064293, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.30658310651779175, "epoch": 1.5012254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.06938694861133382, "kl": 0.11786216497421265, "learning_rate": 5.882268477289408e-07, "loss": 0.0012, "num_tokens": 38790228.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5873552560806274, "sampling/importance_sampling_ratio/mean": 1.00019371509552, "sampling/importance_sampling_ratio/min": 0.5111851096153259, "sampling/sampling_logp_difference/max": 0.6710236072540283, "sampling/sampling_logp_difference/mean": 0.015146405436098576, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 268.1875, "completions/mean_terminated_length": 268.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.25314539670944214, "epoch": 1.5024509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.035721603474821256, "kl": 0.06015956401824951, "learning_rate": 5.875255571168709e-07, "loss": 0.0006, "num_tokens": 38823360.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6903550624847412, "sampling/importance_sampling_ratio/mean": 1.0005155801773071, "sampling/importance_sampling_ratio/min": 0.24656350910663605, "sampling/sampling_logp_difference/max": 1.4001357555389404, "sampling/sampling_logp_difference/mean": 0.013511475175619125, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.228925883769989, "epoch": 1.5036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.04026923665129843, "kl": 0.08304295688867569, "learning_rate": 5.868240888334652e-07, "loss": 0.0008, "num_tokens": 38851480.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6771751642227173, "sampling/importance_sampling_ratio/mean": 1.0003247261047363, "sampling/importance_sampling_ratio/min": 0.6547366380691528, "sampling/sampling_logp_difference/max": 0.5171109437942505, "sampling/sampling_logp_difference/mean": 0.013526812195777893, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 269.734375, "completions/mean_terminated_length": 269.734375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3207527995109558, "epoch": 1.5049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.11007261581515085, "kl": 0.08292139321565628, "learning_rate": 5.861224443026595e-07, "loss": 0.0009, "num_tokens": 38888967.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7183313369750977, "sampling/importance_sampling_ratio/mean": 1.0000386238098145, "sampling/importance_sampling_ratio/min": 0.5618218183517456, "sampling/sampling_logp_difference/max": 0.5765705108642578, "sampling/sampling_logp_difference/mean": 0.015759730711579323, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 217.859375, "completions/mean_terminated_length": 217.859375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.27605897188186646, "epoch": 1.5061274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.009356334476998, "kl": 0.11253257095813751, "learning_rate": 5.854206249487478e-07, "loss": -0.0152, "num_tokens": 38917854.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.9505116939544678, "sampling/importance_sampling_ratio/mean": 0.9997677803039551, "sampling/importance_sampling_ratio/min": 0.38817811012268066, "sampling/sampling_logp_difference/max": 0.9462909698486328, "sampling/sampling_logp_difference/mean": 0.013493069447577, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 240.109375, "completions/mean_terminated_length": 240.109375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21764399111270905, "epoch": 1.5073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.07106719898330048, "kl": 0.08413289487361908, "learning_rate": 5.847186321963792e-07, "loss": 0.0008, "num_tokens": 38951813.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6368263959884644, "sampling/importance_sampling_ratio/mean": 0.9997113943099976, "sampling/importance_sampling_ratio/min": 0.5479965806007385, "sampling/sampling_logp_difference/max": 0.6014862060546875, "sampling/sampling_logp_difference/mean": 0.01225600577890873, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 216.34375, "completions/mean_terminated_length": 216.34375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.18998324871063232, "epoch": 1.508578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0487005525945726, "kl": 0.0694509744644165, "learning_rate": 5.840164674705542e-07, "loss": 0.0006, "num_tokens": 38984347.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.951185703277588, "sampling/importance_sampling_ratio/mean": 0.9997248649597168, "sampling/importance_sampling_ratio/min": 0.3915136456489563, "sampling/sampling_logp_difference/max": 0.937734842300415, "sampling/sampling_logp_difference/mean": 0.012988218106329441, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 219.5625, "completions/mean_terminated_length": 219.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26098933815956116, "epoch": 1.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.10252765873181363, "kl": 0.09128749370574951, "learning_rate": 5.833141321966228e-07, "loss": 0.0009, "num_tokens": 39019167.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5953713655471802, "sampling/importance_sampling_ratio/mean": 0.9997473359107971, "sampling/importance_sampling_ratio/min": 0.13503998517990112, "sampling/sampling_logp_difference/max": 2.0021843910217285, "sampling/sampling_logp_difference/mean": 0.01455174945294857, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 194.484375, "completions/mean_terminated_length": 194.484375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.18635328114032745, "epoch": 1.5110294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.048930907499501734, "kl": 0.08086130768060684, "learning_rate": 5.826116278002813e-07, "loss": 0.0008, "num_tokens": 39046158.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9421849250793457, "sampling/importance_sampling_ratio/mean": 1.0004785060882568, "sampling/importance_sampling_ratio/min": 0.587902307510376, "sampling/sampling_logp_difference/max": 0.663813591003418, "sampling/sampling_logp_difference/mean": 0.011267730966210365, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.32486265897750854, "epoch": 1.5122549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.4183870647964283, "kl": 0.09388802945613861, "learning_rate": 5.819089557075688e-07, "loss": 0.0367, "num_tokens": 39081578.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002061128616333, "sampling/importance_sampling_ratio/min": 0.32993054389953613, "sampling/sampling_logp_difference/max": 1.1088731288909912, "sampling/sampling_logp_difference/mean": 0.016056686639785767, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.23978698253631592, "epoch": 1.5134803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.4485751012632275, "kl": 0.15217433869838715, "learning_rate": 5.812061173448654e-07, "loss": -0.0047, "num_tokens": 39116690.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.624997854232788, "sampling/importance_sampling_ratio/mean": 1.0002484321594238, "sampling/importance_sampling_ratio/min": 0.5305609107017517, "sampling/sampling_logp_difference/max": 0.6338205337524414, "sampling/sampling_logp_difference/mean": 0.013219879940152168, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 291.484375, "completions/mean_terminated_length": 291.484375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.370257705450058, "epoch": 1.5147058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.2805301030228824, "kl": 0.1061616837978363, "learning_rate": 5.805031141388883e-07, "loss": 0.0032, "num_tokens": 39156513.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 0.9997502565383911, "sampling/importance_sampling_ratio/min": 0.528445839881897, "sampling/sampling_logp_difference/max": 0.637814998626709, "sampling/sampling_logp_difference/mean": 0.01492888294160366, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.21345660090446472, "epoch": 1.5159313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.02876361581332321, "kl": 0.0695776715874672, "learning_rate": 5.797999475166896e-07, "loss": 0.0006, "num_tokens": 39203513.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4442031383514404, "sampling/importance_sampling_ratio/mean": 0.9994962215423584, "sampling/importance_sampling_ratio/min": 0.42204251885414124, "sampling/sampling_logp_difference/max": 0.8626492023468018, "sampling/sampling_logp_difference/mean": 0.012082409113645554, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 207.71875, "completions/mean_terminated_length": 207.71875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3437170386314392, "epoch": 1.517156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.0526115728837453, "kl": 0.1577879637479782, "learning_rate": 5.790966189056529e-07, "loss": 0.0024, "num_tokens": 39236583.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8864585161209106, "sampling/importance_sampling_ratio/mean": 0.9998160004615784, "sampling/importance_sampling_ratio/min": 0.6117802262306213, "sampling/sampling_logp_difference/max": 0.6347012519836426, "sampling/sampling_logp_difference/mean": 0.016931496560573578, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.30623072385787964, "epoch": 1.5183823529411766, "frac_reward_zero_std": 0.25, "grad_norm": 2.5214898330188613, "kl": 0.15095746517181396, "learning_rate": 5.783931297334907e-07, "loss": 0.0384, "num_tokens": 39273879.0, "reward": 0.59375, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.974285364151001, "sampling/importance_sampling_ratio/mean": 0.9998407959938049, "sampling/importance_sampling_ratio/min": 0.5403859615325928, "sampling/sampling_logp_difference/max": 0.6802065372467041, "sampling/sampling_logp_difference/mean": 0.015204276889562607, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 257.484375, "completions/mean_terminated_length": 257.484375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.26164013147354126, "epoch": 1.5196078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.042319742710021084, "kl": 0.10378521680831909, "learning_rate": 5.776894814282415e-07, "loss": 0.0008, "num_tokens": 39307622.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.574885606765747, "sampling/importance_sampling_ratio/mean": 0.999725878238678, "sampling/importance_sampling_ratio/min": 0.6167843341827393, "sampling/sampling_logp_difference/max": 0.48323583602905273, "sampling/sampling_logp_difference/mean": 0.013524348847568035, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 199.984375, "completions/mean_terminated_length": 199.984375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.288371205329895, "epoch": 1.5208333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.1808160531731653, "kl": 0.1048680916428566, "learning_rate": 5.769856754182667e-07, "loss": -0.0152, "num_tokens": 39339237.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5963785648345947, "sampling/importance_sampling_ratio/mean": 0.9998101592063904, "sampling/importance_sampling_ratio/min": 0.629842221736908, "sampling/sampling_logp_difference/max": 0.46773767471313477, "sampling/sampling_logp_difference/mean": 0.01503780297935009, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 228.203125, "completions/mean_terminated_length": 228.203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2760133743286133, "epoch": 1.5220588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.1542302053264695, "kl": 0.08837348222732544, "learning_rate": 5.762817131322481e-07, "loss": -0.0054, "num_tokens": 39370978.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9127037525177002, "sampling/importance_sampling_ratio/mean": 0.9995803833007812, "sampling/importance_sampling_ratio/min": 0.3032958507537842, "sampling/sampling_logp_difference/max": 1.1930465698242188, "sampling/sampling_logp_difference/mean": 0.01568417251110077, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 220.234375, "completions/mean_terminated_length": 220.234375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2781566083431244, "epoch": 1.5232843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 4.149632577200551, "kl": 0.1296968013048172, "learning_rate": 5.755775959991844e-07, "loss": 0.015, "num_tokens": 39402817.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6944935321807861, "sampling/importance_sampling_ratio/mean": 0.9998199939727783, "sampling/importance_sampling_ratio/min": 0.04180217534303665, "sampling/sampling_logp_difference/max": 3.174806833267212, "sampling/sampling_logp_difference/mean": 0.013846222311258316, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 147.9375, "completions/mean_terminated_length": 147.9375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.26521608233451843, "epoch": 1.5245098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.3353712233418034, "kl": 0.12796512246131897, "learning_rate": 5.74873325448389e-07, "loss": -0.0034, "num_tokens": 39427325.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6949468851089478, "sampling/importance_sampling_ratio/mean": 0.9996969699859619, "sampling/importance_sampling_ratio/min": 0.6048211455345154, "sampling/sampling_logp_difference/max": 0.5276514291763306, "sampling/sampling_logp_difference/mean": 0.016097765415906906, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 217.703125, "completions/mean_terminated_length": 217.703125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.23921138048171997, "epoch": 1.5257352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.7403096642705569, "kl": 0.09272611141204834, "learning_rate": 5.741689029094861e-07, "loss": -0.0081, "num_tokens": 39457690.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.7083966732025146, "sampling/importance_sampling_ratio/mean": 1.000200629234314, "sampling/importance_sampling_ratio/min": 0.6069169044494629, "sampling/sampling_logp_difference/max": 0.535555362701416, "sampling/sampling_logp_difference/mean": 0.011944685131311417, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 199.59375, "completions/mean_terminated_length": 199.59375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2644798159599304, "epoch": 1.5269607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.5442020790508673, "kl": 0.10834425687789917, "learning_rate": 5.73464329812409e-07, "loss": 0.001, "num_tokens": 39485184.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6782118082046509, "sampling/importance_sampling_ratio/mean": 0.9999837875366211, "sampling/importance_sampling_ratio/min": 0.5354776978492737, "sampling/sampling_logp_difference/max": 0.6245959997177124, "sampling/sampling_logp_difference/mean": 0.01373962964862585, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 221.21875, "completions/mean_terminated_length": 221.21875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.22620631754398346, "epoch": 1.528186274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.3353967451302322, "kl": 0.09754102677106857, "learning_rate": 5.727596075873965e-07, "loss": 0.0354, "num_tokens": 39514190.0, "reward": 0.4375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6716618537902832, "sampling/importance_sampling_ratio/mean": 0.9996615648269653, "sampling/importance_sampling_ratio/min": 0.5687833428382874, "sampling/sampling_logp_difference/max": 0.5642556548118591, "sampling/sampling_logp_difference/mean": 0.011098390445113182, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 193.03125, "completions/mean_terminated_length": 193.03125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2537536025047302, "epoch": 1.5294117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 2.105612797903851, "kl": 0.18153579533100128, "learning_rate": 5.7205473766499e-07, "loss": 0.0515, "num_tokens": 39546144.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009520053863525, "sampling/importance_sampling_ratio/min": 0.6239134669303894, "sampling/sampling_logp_difference/max": 0.807680606842041, "sampling/sampling_logp_difference/mean": 0.014567977748811245, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 169.390625, "completions/mean_terminated_length": 169.390625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.23550954461097717, "epoch": 1.530637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.09223664492514, "kl": 0.1612507551908493, "learning_rate": 5.71349721476031e-07, "loss": -0.0081, "num_tokens": 39577081.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5416961908340454, "sampling/importance_sampling_ratio/mean": 0.9995707869529724, "sampling/importance_sampling_ratio/min": 0.6337987780570984, "sampling/sampling_logp_difference/max": 0.45602381229400635, "sampling/sampling_logp_difference/mean": 0.0124875633046031, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35919585824012756, "epoch": 1.531862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.2726428466658852, "kl": 0.13468556106090546, "learning_rate": 5.706445604516574e-07, "loss": 0.0094, "num_tokens": 39620777.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.846413254737854, "sampling/importance_sampling_ratio/mean": 0.9995013475418091, "sampling/importance_sampling_ratio/min": 0.36016330122947693, "sampling/sampling_logp_difference/max": 1.02119779586792, "sampling/sampling_logp_difference/mean": 0.018313605338335037, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 236.328125, "completions/mean_terminated_length": 236.328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2343226969242096, "epoch": 1.5330882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.3112108523889763, "kl": 0.09341298043727875, "learning_rate": 5.699392560233017e-07, "loss": 0.0829, "num_tokens": 39653214.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4505730867385864, "sampling/importance_sampling_ratio/mean": 0.9998184442520142, "sampling/importance_sampling_ratio/min": 0.4843129515647888, "sampling/sampling_logp_difference/max": 0.7250239849090576, "sampling/sampling_logp_difference/mean": 0.013272561132907867, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.23711419105529785, "epoch": 1.534313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.111938035878301, "kl": 0.10805274546146393, "learning_rate": 5.69233809622687e-07, "loss": 0.0112, "num_tokens": 39685150.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5494507551193237, "sampling/importance_sampling_ratio/mean": 0.9999529123306274, "sampling/importance_sampling_ratio/min": 0.5428563356399536, "sampling/sampling_logp_difference/max": 0.6109106540679932, "sampling/sampling_logp_difference/mean": 0.012215834110975266, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 202.34375, "completions/mean_terminated_length": 202.34375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2381390482187271, "epoch": 1.5355392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.6833653141312037, "kl": 0.0950554683804512, "learning_rate": 5.685282226818249e-07, "loss": 0.0177, "num_tokens": 39719092.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999245405197144, "sampling/importance_sampling_ratio/min": 0.5272694826126099, "sampling/sampling_logp_difference/max": 1.5067219734191895, "sampling/sampling_logp_difference/mean": 0.013562340289354324, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 208.296875, "completions/mean_terminated_length": 208.296875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.34556320309638977, "epoch": 1.5367647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.7188936050742745, "kl": 0.12145892530679703, "learning_rate": 5.678224966330119e-07, "loss": 0.0149, "num_tokens": 39753495.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6007500886917114, "sampling/importance_sampling_ratio/mean": 0.9999268054962158, "sampling/importance_sampling_ratio/min": 0.5603798031806946, "sampling/sampling_logp_difference/max": 0.5791404247283936, "sampling/sampling_logp_difference/mean": 0.016672348603606224, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 177.609375, "completions/mean_terminated_length": 177.609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.23078593611717224, "epoch": 1.5379901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.06222337763973542, "kl": 0.08189937472343445, "learning_rate": 5.671166329088277e-07, "loss": 0.0008, "num_tokens": 39780702.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6662049293518066, "sampling/importance_sampling_ratio/mean": 1.0002987384796143, "sampling/importance_sampling_ratio/min": 0.47876015305519104, "sampling/sampling_logp_difference/max": 0.7365555167198181, "sampling/sampling_logp_difference/mean": 0.013800526969134808, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 243.78125, "completions/mean_terminated_length": 243.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2520521283149719, "epoch": 1.5392156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.2861153798710994, "kl": 0.06537686288356781, "learning_rate": 5.664106329421305e-07, "loss": -0.0495, "num_tokens": 39814560.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.646364450454712, "sampling/importance_sampling_ratio/mean": 1.0000898838043213, "sampling/importance_sampling_ratio/min": 0.5492440462112427, "sampling/sampling_logp_difference/max": 0.5992124080657959, "sampling/sampling_logp_difference/mean": 0.012905338779091835, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 215.3125, "completions/mean_terminated_length": 215.3125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2798084616661072, "epoch": 1.5404411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.062002073670821556, "kl": 0.09289909899234772, "learning_rate": 5.657044981660559e-07, "loss": 0.0009, "num_tokens": 39848468.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5731983184814453, "sampling/importance_sampling_ratio/mean": 0.9994695782661438, "sampling/importance_sampling_ratio/min": 0.5772298574447632, "sampling/sampling_logp_difference/max": 0.5495147705078125, "sampling/sampling_logp_difference/mean": 0.014737410470843315, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 167.984375, "completions/mean_terminated_length": 167.984375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.23264190554618835, "epoch": 1.5416666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.10382213039474375, "kl": 0.10797363519668579, "learning_rate": 5.649982300140123e-07, "loss": 0.0011, "num_tokens": 39876115.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8272643089294434, "sampling/importance_sampling_ratio/mean": 0.999854564666748, "sampling/importance_sampling_ratio/min": 0.585029125213623, "sampling/sampling_logp_difference/max": 0.6028199195861816, "sampling/sampling_logp_difference/mean": 0.013319024816155434, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 173.421875, "completions/mean_terminated_length": 173.421875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.23104506731033325, "epoch": 1.5428921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.384033702678028, "kl": 0.08499298989772797, "learning_rate": 5.642918299196796e-07, "loss": -0.0176, "num_tokens": 39902078.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6162153482437134, "sampling/importance_sampling_ratio/mean": 1.0007696151733398, "sampling/importance_sampling_ratio/min": 0.621555745601654, "sampling/sampling_logp_difference/max": 0.4800872802734375, "sampling/sampling_logp_difference/mean": 0.012712078168988228, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.20366084575653076, "epoch": 1.5441176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.6610212922375305, "kl": 0.11102088540792465, "learning_rate": 5.635852993170052e-07, "loss": 0.0038, "num_tokens": 39925158.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.630755066871643, "sampling/importance_sampling_ratio/mean": 1.000718116760254, "sampling/importance_sampling_ratio/min": 0.47591328620910645, "sampling/sampling_logp_difference/max": 0.7425196170806885, "sampling/sampling_logp_difference/mean": 0.013576138764619827, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 240.703125, "completions/mean_terminated_length": 240.703125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2457953691482544, "epoch": 1.545343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.5482759188590858, "kl": 0.07239291071891785, "learning_rate": 5.628786396402013e-07, "loss": -0.017, "num_tokens": 39959203.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 1.000040888786316, "sampling/importance_sampling_ratio/min": 0.4159519374370575, "sampling/sampling_logp_difference/max": 0.877185583114624, "sampling/sampling_logp_difference/mean": 0.013064755126833916, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 129.765625, "completions/mean_terminated_length": 129.765625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.19108673930168152, "epoch": 1.5465686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.09128008485690275, "kl": 0.0935688316822052, "learning_rate": 5.621718523237426e-07, "loss": 0.0009, "num_tokens": 39983556.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002806186676025, "sampling/importance_sampling_ratio/min": 0.6650000214576721, "sampling/sampling_logp_difference/max": 0.771212100982666, "sampling/sampling_logp_difference/mean": 0.011321095749735832, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 215.4375, "completions/mean_terminated_length": 215.4375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23867498338222504, "epoch": 1.5477941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.9279048547268585, "kl": 0.09833469241857529, "learning_rate": 5.614649388023622e-07, "loss": -0.0002, "num_tokens": 40014288.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6091666221618652, "sampling/importance_sampling_ratio/mean": 0.9999319314956665, "sampling/importance_sampling_ratio/min": 0.549793004989624, "sampling/sampling_logp_difference/max": 0.5982134342193604, "sampling/sampling_logp_difference/mean": 0.01338406465947628, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 193.109375, "completions/mean_terminated_length": 193.109375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.21172797679901123, "epoch": 1.5490196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.3127184525500883, "kl": 0.06100928783416748, "learning_rate": 5.607579005110502e-07, "loss": 0.0006, "num_tokens": 40041975.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9005506038665771, "sampling/importance_sampling_ratio/mean": 1.0003749132156372, "sampling/importance_sampling_ratio/min": 0.29519444704055786, "sampling/sampling_logp_difference/max": 1.220120906829834, "sampling/sampling_logp_difference/mean": 0.012064306065440178, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 169.578125, "completions/mean_terminated_length": 169.578125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2743019461631775, "epoch": 1.5502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.4961160389061916, "kl": 0.10028813779354095, "learning_rate": 5.60050738885049e-07, "loss": 0.0185, "num_tokens": 40070188.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997214674949646, "sampling/importance_sampling_ratio/min": 0.4946571886539459, "sampling/sampling_logp_difference/max": 0.7074887752532959, "sampling/sampling_logp_difference/mean": 0.01720958948135376, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 182.734375, "completions/mean_terminated_length": 182.734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3171706795692444, "epoch": 1.5514705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.4790588340111015, "kl": 0.14416320621967316, "learning_rate": 5.593434553598525e-07, "loss": -0.0161, "num_tokens": 40101547.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5588680505752563, "sampling/importance_sampling_ratio/mean": 0.9998143911361694, "sampling/importance_sampling_ratio/min": 0.4989607632160187, "sampling/sampling_logp_difference/max": 0.695227861404419, "sampling/sampling_logp_difference/mean": 0.017059538513422012, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2670314908027649, "epoch": 1.5526960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.3274914775166644, "kl": 0.08601544052362442, "learning_rate": 5.586360513712009e-07, "loss": 0.0017, "num_tokens": 40129915.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000174641609192, "sampling/importance_sampling_ratio/min": 0.5666781067848206, "sampling/sampling_logp_difference/max": 0.7321939468383789, "sampling/sampling_logp_difference/mean": 0.014580268412828445, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 218.40625, "completions/mean_terminated_length": 218.40625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.28639858961105347, "epoch": 1.553921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.0743242514633502, "kl": 0.0894109308719635, "learning_rate": 5.579285283550797e-07, "loss": -0.0179, "num_tokens": 40163701.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9880058765411377, "sampling/importance_sampling_ratio/mean": 1.0004611015319824, "sampling/importance_sampling_ratio/min": 0.5505126714706421, "sampling/sampling_logp_difference/max": 0.6871321201324463, "sampling/sampling_logp_difference/mean": 0.01633625663816929, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 163.984375, "completions/mean_terminated_length": 163.984375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.23159533739089966, "epoch": 1.5551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.3265368422339745, "kl": 0.1146799698472023, "learning_rate": 5.572208877477159e-07, "loss": -0.0101, "num_tokens": 40193972.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4586268663406372, "sampling/importance_sampling_ratio/mean": 0.9998776912689209, "sampling/importance_sampling_ratio/min": 0.6181763410568237, "sampling/sampling_logp_difference/max": 0.48098158836364746, "sampling/sampling_logp_difference/mean": 0.013385292142629623, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 128.265625, "completions/mean_terminated_length": 128.265625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2298588752746582, "epoch": 1.5563725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.1456240778612018, "kl": 0.11662641167640686, "learning_rate": 5.565131309855752e-07, "loss": 0.0011, "num_tokens": 40220181.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.695549726486206, "sampling/importance_sampling_ratio/mean": 0.9998950958251953, "sampling/importance_sampling_ratio/min": 0.3276721239089966, "sampling/sampling_logp_difference/max": 1.1157417297363281, "sampling/sampling_logp_difference/mean": 0.015842556953430176, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 213.21875, "completions/mean_terminated_length": 213.21875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.26580965518951416, "epoch": 1.5575980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.9841841882140552, "kl": 0.10150258243083954, "learning_rate": 5.558052595053586e-07, "loss": -0.0212, "num_tokens": 40257843.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993384480476379, "sampling/importance_sampling_ratio/min": 0.23381738364696503, "sampling/sampling_logp_difference/max": 1.4532148838043213, "sampling/sampling_logp_difference/mean": 0.01650913991034031, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 211.765625, "completions/mean_terminated_length": 211.765625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.22360765933990479, "epoch": 1.5588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.09318137774395309, "kl": 0.07635696977376938, "learning_rate": 5.550972747440005e-07, "loss": 0.0007, "num_tokens": 40285636.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8329914808273315, "sampling/importance_sampling_ratio/mean": 1.0001411437988281, "sampling/importance_sampling_ratio/min": 0.568603515625, "sampling/sampling_logp_difference/max": 0.6059494018554688, "sampling/sampling_logp_difference/mean": 0.012909941375255585, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 238.90625, "completions/mean_terminated_length": 238.90625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3557601571083069, "epoch": 1.5600490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.56944518339622, "kl": 0.11926597356796265, "learning_rate": 5.543891781386655e-07, "loss": -0.0501, "num_tokens": 40326462.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.8657264709472656, "sampling/importance_sampling_ratio/mean": 1.0002565383911133, "sampling/importance_sampling_ratio/min": 0.46485185623168945, "sampling/sampling_logp_difference/max": 0.7660365104675293, "sampling/sampling_logp_difference/mean": 0.016536138951778412, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 192.703125, "completions/mean_terminated_length": 192.703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.21355998516082764, "epoch": 1.5612745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.042234641612648, "kl": 0.08831916004419327, "learning_rate": 5.536809711267443e-07, "loss": -0.011, "num_tokens": 40354635.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8180792331695557, "sampling/importance_sampling_ratio/mean": 1.0000646114349365, "sampling/importance_sampling_ratio/min": 0.5598354339599609, "sampling/sampling_logp_difference/max": 0.5977805852890015, "sampling/sampling_logp_difference/mean": 0.01221003569662571, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2758662700653076, "epoch": 1.5625, "frac_reward_zero_std": 0.75, "grad_norm": 1.4188679796807888, "kl": 0.10724423825740814, "learning_rate": 5.529726551458526e-07, "loss": 0.01, "num_tokens": 40387651.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8446329832077026, "sampling/importance_sampling_ratio/mean": 0.9991621375083923, "sampling/importance_sampling_ratio/min": 0.4571169912815094, "sampling/sampling_logp_difference/max": 0.7828159332275391, "sampling/sampling_logp_difference/mean": 0.016088467091321945, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 223.421875, "completions/mean_terminated_length": 223.421875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2723897695541382, "epoch": 1.5637254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.1141962541942754, "kl": 0.08754530549049377, "learning_rate": 5.522642316338268e-07, "loss": -0.0267, "num_tokens": 40419886.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003609657287598, "sampling/importance_sampling_ratio/min": 0.32915252447128296, "sampling/sampling_logp_difference/max": 1.1112340688705444, "sampling/sampling_logp_difference/mean": 0.01671537756919861, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 158.203125, "completions/mean_terminated_length": 158.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2390233278274536, "epoch": 1.5649509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.0850297375053766, "kl": 0.11868917942047119, "learning_rate": 5.515557020287218e-07, "loss": 0.0012, "num_tokens": 40445563.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001310110092163, "sampling/importance_sampling_ratio/min": 0.619220495223999, "sampling/sampling_logp_difference/max": 0.8923323154449463, "sampling/sampling_logp_difference/mean": 0.014304285869002342, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 217.515625, "completions/mean_terminated_length": 217.515625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2527675926685333, "epoch": 1.5661764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.002817954155109, "kl": 0.10649637132883072, "learning_rate": 5.508470677688078e-07, "loss": -0.0023, "num_tokens": 40478076.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.821816086769104, "sampling/importance_sampling_ratio/mean": 1.0001246929168701, "sampling/importance_sampling_ratio/min": 0.6048331260681152, "sampling/sampling_logp_difference/max": 0.5998338460922241, "sampling/sampling_logp_difference/mean": 0.014033079147338867, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 191.71875, "completions/mean_terminated_length": 191.71875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3498360812664032, "epoch": 1.5674019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.817895853693584, "kl": 0.14529192447662354, "learning_rate": 5.501383302925677e-07, "loss": 0.0287, "num_tokens": 40513050.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8661153316497803, "sampling/importance_sampling_ratio/mean": 1.000024437904358, "sampling/importance_sampling_ratio/min": 0.6282871961593628, "sampling/sampling_logp_difference/max": 0.6238589286804199, "sampling/sampling_logp_difference/mean": 0.01771906018257141, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 237.015625, "completions/mean_terminated_length": 237.015625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.29353150725364685, "epoch": 1.5686274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.22491306039810763, "kl": 0.09252274036407471, "learning_rate": 5.494294910386933e-07, "loss": 0.001, "num_tokens": 40551275.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5715757608413696, "sampling/importance_sampling_ratio/mean": 1.0004040002822876, "sampling/importance_sampling_ratio/min": 0.4678581655025482, "sampling/sampling_logp_difference/max": 0.7595901489257812, "sampling/sampling_logp_difference/mean": 0.01582920365035534, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 129.234375, "completions/mean_terminated_length": 129.234375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.23700666427612305, "epoch": 1.5698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.9577397856869958, "kl": 0.12990131974220276, "learning_rate": 5.487205514460835e-07, "loss": -0.0058, "num_tokens": 40576010.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6820436716079712, "sampling/importance_sampling_ratio/mean": 1.0001105070114136, "sampling/importance_sampling_ratio/min": 0.5166517496109009, "sampling/sampling_logp_difference/max": 0.6603862643241882, "sampling/sampling_logp_difference/mean": 0.015840716660022736, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 163.578125, "completions/mean_terminated_length": 163.578125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.20090463757514954, "epoch": 1.571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.05936568691447249, "kl": 0.07548060268163681, "learning_rate": 5.480115129538409e-07, "loss": 0.0008, "num_tokens": 40605375.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.598211407661438, "sampling/importance_sampling_ratio/mean": 0.9995618462562561, "sampling/importance_sampling_ratio/min": 0.4885278046131134, "sampling/sampling_logp_difference/max": 0.7163589000701904, "sampling/sampling_logp_difference/mean": 0.012459134683012962, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 168.65625, "completions/mean_terminated_length": 168.65625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23894764482975006, "epoch": 1.5723039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.4373800788773192, "kl": 0.07795411348342896, "learning_rate": 5.473023770012686e-07, "loss": -0.0108, "num_tokens": 40632025.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.959810495376587, "sampling/importance_sampling_ratio/mean": 1.000944972038269, "sampling/importance_sampling_ratio/min": 0.4056454598903656, "sampling/sampling_logp_difference/max": 0.902275800704956, "sampling/sampling_logp_difference/mean": 0.01591445505619049, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 201.109375, "completions/mean_terminated_length": 201.109375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2994091510772705, "epoch": 1.5735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 2.485785633874617, "kl": 0.12637823820114136, "learning_rate": 5.465931450278676e-07, "loss": -0.0054, "num_tokens": 40664384.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6253334283828735, "sampling/importance_sampling_ratio/mean": 0.9995725154876709, "sampling/importance_sampling_ratio/min": 0.5292443037033081, "sampling/sampling_logp_difference/max": 0.6363050937652588, "sampling/sampling_logp_difference/mean": 0.015157992951571941, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 264.28125, "completions/mean_terminated_length": 264.28125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3000125586986542, "epoch": 1.5747549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.1394094039805573, "kl": 0.12689156830310822, "learning_rate": 5.458838184733341e-07, "loss": 0.0098, "num_tokens": 40697474.0, "reward": -0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8945499658584595, "sampling/importance_sampling_ratio/mean": 1.0002498626708984, "sampling/importance_sampling_ratio/min": 0.5997557044029236, "sampling/sampling_logp_difference/max": 0.6389813423156738, "sampling/sampling_logp_difference/mean": 0.015428531914949417, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 187.046875, "completions/mean_terminated_length": 187.046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.22485093772411346, "epoch": 1.5759803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.6563343521050014, "kl": 0.06725899875164032, "learning_rate": 5.451743987775559e-07, "loss": 0.0147, "num_tokens": 40728741.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999529719352722, "sampling/importance_sampling_ratio/min": 0.4202858805656433, "sampling/sampling_logp_difference/max": 0.8668200969696045, "sampling/sampling_logp_difference/mean": 0.014246209524571896, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 223.328125, "completions/mean_terminated_length": 223.328125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2730158567428589, "epoch": 1.5772058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.0062192756669384, "kl": 0.10583344101905823, "learning_rate": 5.444648873806101e-07, "loss": -0.0233, "num_tokens": 40758074.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.406072735786438, "sampling/importance_sampling_ratio/mean": 0.9998959302902222, "sampling/importance_sampling_ratio/min": 0.46186599135398865, "sampling/sampling_logp_difference/max": 0.7724804878234863, "sampling/sampling_logp_difference/mean": 0.014256740920245647, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 187.921875, "completions/mean_terminated_length": 187.921875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.28058645129203796, "epoch": 1.5784313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 1.9967848781737365, "kl": 0.09992186725139618, "learning_rate": 5.437552857227597e-07, "loss": 0.0307, "num_tokens": 40786661.0, "reward": 0.59375, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4743789434432983, "sampling/importance_sampling_ratio/mean": 1.0000240802764893, "sampling/importance_sampling_ratio/min": 0.5766149759292603, "sampling/sampling_logp_difference/max": 0.5505805015563965, "sampling/sampling_logp_difference/mean": 0.013886423781514168, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3286219537258148, "epoch": 1.579656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.4992127432975813, "kl": 0.11583884805440903, "learning_rate": 5.430455952444512e-07, "loss": 0.046, "num_tokens": 40814549.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6112518310546875, "sampling/importance_sampling_ratio/mean": 0.9998191595077515, "sampling/importance_sampling_ratio/min": 0.5836060047149658, "sampling/sampling_logp_difference/max": 0.5385291576385498, "sampling/sampling_logp_difference/mean": 0.016217375174164772, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 257.609375, "completions/mean_terminated_length": 257.609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.28263741731643677, "epoch": 1.5808823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.4290550984192945, "kl": 0.10155349224805832, "learning_rate": 5.423358173863116e-07, "loss": 0.0061, "num_tokens": 40848812.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7165179252624512, "sampling/importance_sampling_ratio/mean": 0.9999136328697205, "sampling/importance_sampling_ratio/min": 0.5624218583106995, "sampling/sampling_logp_difference/max": 0.5755031108856201, "sampling/sampling_logp_difference/mean": 0.014271766878664494, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 213.53125, "completions/mean_terminated_length": 213.53125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24739715456962585, "epoch": 1.5821078431372548, "frac_reward_zero_std": 0.5, "grad_norm": 1.6734622190020279, "kl": 0.09649119526147842, "learning_rate": 5.416259535891446e-07, "loss": 0.04, "num_tokens": 40879598.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995496869087219, "sampling/importance_sampling_ratio/min": 0.3670720160007477, "sampling/sampling_logp_difference/max": 1.0069791078567505, "sampling/sampling_logp_difference/mean": 0.013408919796347618, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 215.5625, "completions/mean_terminated_length": 215.5625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.294612318277359, "epoch": 1.5833333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.8663318814916061, "kl": 0.1407470703125, "learning_rate": 5.409160052939291e-07, "loss": -0.0069, "num_tokens": 40915298.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000393390655518, "sampling/importance_sampling_ratio/min": 0.38391175866127014, "sampling/sampling_logp_difference/max": 0.9573426246643066, "sampling/sampling_logp_difference/mean": 0.01669587939977646, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.25864478945732117, "epoch": 1.5845588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.6736988394049386, "kl": 0.08632921427488327, "learning_rate": 5.402059739418148e-07, "loss": 0.1311, "num_tokens": 40942978.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006288290023804, "sampling/importance_sampling_ratio/min": 0.6288881301879883, "sampling/sampling_logp_difference/max": 0.7209138870239258, "sampling/sampling_logp_difference/mean": 0.014864898286759853, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 260.15625, "completions/mean_terminated_length": 260.15625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.25275641679763794, "epoch": 1.5857843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 1.2194443009765528, "kl": 0.0801507979631424, "learning_rate": 5.394958609741206e-07, "loss": -0.0058, "num_tokens": 40975436.0, "reward": -0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997862577438354, "sampling/importance_sampling_ratio/min": 0.6255087852478027, "sampling/sampling_logp_difference/max": 0.7695990800857544, "sampling/sampling_logp_difference/mean": 0.012196059338748455, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 250.15625, "completions/mean_terminated_length": 250.15625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3004721701145172, "epoch": 1.5870098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.2984374525467763, "kl": 0.1429024338722229, "learning_rate": 5.387856678323307e-07, "loss": 0.0101, "num_tokens": 41009478.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5772258043289185, "sampling/importance_sampling_ratio/mean": 1.000220537185669, "sampling/importance_sampling_ratio/min": 0.559878408908844, "sampling/sampling_logp_difference/max": 0.5800356268882751, "sampling/sampling_logp_difference/mean": 0.015601429156959057, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 167.65625, "completions/mean_terminated_length": 167.65625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1998165249824524, "epoch": 1.5882352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.06245472818406181, "kl": 0.0781020000576973, "learning_rate": 5.380753959580922e-07, "loss": 0.0008, "num_tokens": 41049824.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9509587287902832, "sampling/importance_sampling_ratio/mean": 1.0004262924194336, "sampling/importance_sampling_ratio/min": 0.5226418375968933, "sampling/sampling_logp_difference/max": 0.668320894241333, "sampling/sampling_logp_difference/mean": 0.012588824145495892, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 244.046875, "completions/mean_terminated_length": 244.046875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3870052993297577, "epoch": 1.5894607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.1356174564170538, "kl": 0.1631091684103012, "learning_rate": 5.373650467932121e-07, "loss": -0.0164, "num_tokens": 41083331.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6629855632781982, "sampling/importance_sampling_ratio/mean": 1.0000462532043457, "sampling/importance_sampling_ratio/min": 0.6024278998374939, "sampling/sampling_logp_difference/max": 0.5086145401000977, "sampling/sampling_logp_difference/mean": 0.017559468746185303, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 178.03125, "completions/mean_terminated_length": 178.03125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24488691985607147, "epoch": 1.590686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.05664842750606466, "kl": 0.07084640115499496, "learning_rate": 5.366546217796541e-07, "loss": 0.0007, "num_tokens": 41115637.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9388110637664795, "sampling/importance_sampling_ratio/mean": 1.000507116317749, "sampling/importance_sampling_ratio/min": 0.5753761529922485, "sampling/sampling_logp_difference/max": 0.6620749235153198, "sampling/sampling_logp_difference/mean": 0.014486636035144329, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 258.1875, "completions/mean_terminated_length": 258.1875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3009330928325653, "epoch": 1.5919117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.6563035468953373, "kl": 0.0957925021648407, "learning_rate": 5.359441223595363e-07, "loss": -0.0652, "num_tokens": 41152337.0, "reward": 0.5, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6631168127059937, "sampling/importance_sampling_ratio/mean": 0.9993993043899536, "sampling/importance_sampling_ratio/min": 0.5552826523780823, "sampling/sampling_logp_difference/max": 0.5882779359817505, "sampling/sampling_logp_difference/mean": 0.013483626767992973, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 219.859375, "completions/mean_terminated_length": 219.859375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.28012022376060486, "epoch": 1.593137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.5183671380069574, "kl": 0.13128457963466644, "learning_rate": 5.352335499751269e-07, "loss": -0.0276, "num_tokens": 41182680.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.9445205926895142, "sampling/importance_sampling_ratio/mean": 1.0000536441802979, "sampling/importance_sampling_ratio/min": 0.5100371837615967, "sampling/sampling_logp_difference/max": 0.673271656036377, "sampling/sampling_logp_difference/mean": 0.01482788659632206, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 252.359375, "completions/mean_terminated_length": 252.359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2718067765235901, "epoch": 1.594362745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.5306560736696353, "kl": 0.0820138156414032, "learning_rate": 5.345229060688433e-07, "loss": -0.0155, "num_tokens": 41218367.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6451834440231323, "sampling/importance_sampling_ratio/mean": 0.9999179840087891, "sampling/importance_sampling_ratio/min": 0.5406036972999573, "sampling/sampling_logp_difference/max": 0.6150689125061035, "sampling/sampling_logp_difference/mean": 0.014146863482892513, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 177.34375, "completions/mean_terminated_length": 177.34375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.22269929945468903, "epoch": 1.5955882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.3923351940283724, "kl": 0.07856234908103943, "learning_rate": 5.338121920832475e-07, "loss": 0.0008, "num_tokens": 41247365.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.890487790107727, "sampling/importance_sampling_ratio/mean": 0.9993278980255127, "sampling/importance_sampling_ratio/min": 0.5250731706619263, "sampling/sampling_logp_difference/max": 0.6442176103591919, "sampling/sampling_logp_difference/mean": 0.014698393642902374, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 193.828125, "completions/mean_terminated_length": 193.828125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3115875720977783, "epoch": 1.596813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.7693350321536974, "kl": 0.1558687388896942, "learning_rate": 5.331014094610438e-07, "loss": 0.0104, "num_tokens": 41275018.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6519043445587158, "sampling/importance_sampling_ratio/mean": 0.9997758865356445, "sampling/importance_sampling_ratio/min": 0.6171634197235107, "sampling/sampling_logp_difference/max": 0.5019288063049316, "sampling/sampling_logp_difference/mean": 0.016039744019508362, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 184.890625, "completions/mean_terminated_length": 184.890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.23528572916984558, "epoch": 1.5980392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.2081387886376294, "kl": 0.0981862023472786, "learning_rate": 5.323905596450759e-07, "loss": 0.0365, "num_tokens": 41304643.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004469156265259, "sampling/importance_sampling_ratio/min": 0.4046587347984314, "sampling/sampling_logp_difference/max": 1.098386287689209, "sampling/sampling_logp_difference/mean": 0.01493294071406126, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 212.859375, "completions/mean_terminated_length": 212.859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27819180488586426, "epoch": 1.5992647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 1.1589790752226412, "kl": 0.10933251678943634, "learning_rate": 5.31679644078324e-07, "loss": -0.0048, "num_tokens": 41333914.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6308337450027466, "sampling/importance_sampling_ratio/mean": 0.99986732006073, "sampling/importance_sampling_ratio/min": 0.5278540849685669, "sampling/sampling_logp_difference/max": 0.6389353275299072, "sampling/sampling_logp_difference/mean": 0.014791795052587986, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 180.953125, "completions/mean_terminated_length": 180.953125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.19707581400871277, "epoch": 1.6004901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.07642399662364885, "kl": 0.09464478492736816, "learning_rate": 5.309686642039015e-07, "loss": 0.0009, "num_tokens": 41361063.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.62743079662323, "sampling/importance_sampling_ratio/mean": 1.0002444982528687, "sampling/importance_sampling_ratio/min": 0.41710734367370605, "sampling/sampling_logp_difference/max": 0.8744117021560669, "sampling/sampling_logp_difference/mean": 0.013895422220230103, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 202.453125, "completions/mean_terminated_length": 202.453125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.20806419849395752, "epoch": 1.6017156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.04285113876154974, "kl": 0.07179662585258484, "learning_rate": 5.302576214650527e-07, "loss": 0.0006, "num_tokens": 41393028.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8008705377578735, "sampling/importance_sampling_ratio/mean": 1.000395655632019, "sampling/importance_sampling_ratio/min": 0.49519240856170654, "sampling/sampling_logp_difference/max": 0.7028088569641113, "sampling/sampling_logp_difference/mean": 0.01474726665765047, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 194.96875, "completions/mean_terminated_length": 194.96875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.1974889189004898, "epoch": 1.6029411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.4297146563298104, "kl": 0.08590207993984222, "learning_rate": 5.295465173051491e-07, "loss": 0.077, "num_tokens": 41423906.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.755322813987732, "sampling/importance_sampling_ratio/mean": 0.9999452233314514, "sampling/importance_sampling_ratio/min": 0.6205163598060608, "sampling/sampling_logp_difference/max": 0.5626528263092041, "sampling/sampling_logp_difference/mean": 0.012646855786442757, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 228.6875, "completions/mean_terminated_length": 228.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.31509193778038025, "epoch": 1.6041666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 1.8576238928473223, "kl": 0.12904363870620728, "learning_rate": 5.288353531676873e-07, "loss": -0.0124, "num_tokens": 41454750.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5882036685943604, "sampling/importance_sampling_ratio/mean": 0.9996026754379272, "sampling/importance_sampling_ratio/min": 0.5271825194358826, "sampling/sampling_logp_difference/max": 0.6402084827423096, "sampling/sampling_logp_difference/mean": 0.015689942985773087, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 140.96875, "completions/mean_terminated_length": 140.96875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.1562458574771881, "epoch": 1.6053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.09737824294354848, "kl": 0.09509014338254929, "learning_rate": 5.281241304962852e-07, "loss": 0.0009, "num_tokens": 41480060.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5647554397583008, "sampling/importance_sampling_ratio/mean": 0.999330461025238, "sampling/importance_sampling_ratio/min": 0.44270896911621094, "sampling/sampling_logp_difference/max": 0.814842700958252, "sampling/sampling_logp_difference/mean": 0.01225283369421959, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 164.140625, "completions/mean_terminated_length": 164.140625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.17961689829826355, "epoch": 1.6066176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.12433184014965502, "kl": 0.12926530838012695, "learning_rate": 5.2741285073468e-07, "loss": 0.0012, "num_tokens": 41516757.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 0.9998909831047058, "sampling/importance_sampling_ratio/min": 0.4443853497505188, "sampling/sampling_logp_difference/max": 0.8110632300376892, "sampling/sampling_logp_difference/mean": 0.011371592059731483, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 190.703125, "completions/mean_terminated_length": 190.703125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2711417078971863, "epoch": 1.607843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 2.7073808572881557, "kl": 0.15485966205596924, "learning_rate": 5.267015153267245e-07, "loss": 0.0367, "num_tokens": 41548786.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005667209625244, "sampling/importance_sampling_ratio/min": 0.4746485650539398, "sampling/sampling_logp_difference/max": 0.8474371433258057, "sampling/sampling_logp_difference/mean": 0.016850367188453674, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 183.28125, "completions/mean_terminated_length": 183.28125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.25501537322998047, "epoch": 1.6090686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 2.093866343919497, "kl": 0.11585363000631332, "learning_rate": 5.259901257163844e-07, "loss": 0.0053, "num_tokens": 41577060.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6257047653198242, "sampling/importance_sampling_ratio/mean": 0.9992328882217407, "sampling/importance_sampling_ratio/min": 0.4216938018798828, "sampling/sampling_logp_difference/max": 0.8634757995605469, "sampling/sampling_logp_difference/mean": 0.01534545049071312, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 133.21875, "completions/mean_terminated_length": 133.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2237296998500824, "epoch": 1.6102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.055933878857765205, "kl": 0.07522813230752945, "learning_rate": 5.252786833477358e-07, "loss": 0.0007, "num_tokens": 41605218.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.558457612991333, "sampling/importance_sampling_ratio/mean": 0.9998308420181274, "sampling/importance_sampling_ratio/min": 0.48925769329071045, "sampling/sampling_logp_difference/max": 0.7148659229278564, "sampling/sampling_logp_difference/mean": 0.016113966703414917, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 259.734375, "completions/mean_terminated_length": 259.734375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.24126943945884705, "epoch": 1.6115196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.9630807576324676, "kl": 0.08669568598270416, "learning_rate": 5.245671896649612e-07, "loss": 0.0099, "num_tokens": 41640865.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997607469558716, "sampling/importance_sampling_ratio/min": 0.506397008895874, "sampling/sampling_logp_difference/max": 1.1126689910888672, "sampling/sampling_logp_difference/mean": 0.012919275090098381, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 242.28125, "completions/mean_terminated_length": 242.28125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.227101132273674, "epoch": 1.6127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03724252908873951, "kl": 0.06919814646244049, "learning_rate": 5.23855646112348e-07, "loss": 0.0006, "num_tokens": 41671667.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9311466217041016, "sampling/importance_sampling_ratio/mean": 0.9998438358306885, "sampling/importance_sampling_ratio/min": 0.6051800847053528, "sampling/sampling_logp_difference/max": 0.658113956451416, "sampling/sampling_logp_difference/mean": 0.013156576082110405, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 270.453125, "completions/mean_terminated_length": 270.453125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.30856984853744507, "epoch": 1.6139705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.1497588769228289, "kl": 0.09006384760141373, "learning_rate": 5.231440541342845e-07, "loss": -0.0173, "num_tokens": 41704480.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5483697652816772, "sampling/importance_sampling_ratio/mean": 1.0000381469726562, "sampling/importance_sampling_ratio/min": 0.5144515037536621, "sampling/sampling_logp_difference/max": 0.664654016494751, "sampling/sampling_logp_difference/mean": 0.015054329298436642, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 201.21875, "completions/mean_terminated_length": 201.21875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.18170005083084106, "epoch": 1.6151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.05194959550465986, "kl": 0.06692922860383987, "learning_rate": 5.224324151752575e-07, "loss": 0.0007, "num_tokens": 41737742.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000636339187622, "sampling/importance_sampling_ratio/min": 0.3987736403942108, "sampling/sampling_logp_difference/max": 0.9193613529205322, "sampling/sampling_logp_difference/mean": 0.011669599451124668, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 197.078125, "completions/mean_terminated_length": 197.078125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.29809680581092834, "epoch": 1.616421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.6056980751564378, "kl": 0.09784778952598572, "learning_rate": 5.217207306798487e-07, "loss": -0.0102, "num_tokens": 41767475.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003911256790161, "sampling/importance_sampling_ratio/min": 0.6152052879333496, "sampling/sampling_logp_difference/max": 0.8550095558166504, "sampling/sampling_logp_difference/mean": 0.01622714102268219, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 157.53125, "completions/mean_terminated_length": 157.53125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.20960770547389984, "epoch": 1.6176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.2581966492870358, "kl": 0.07081106305122375, "learning_rate": 5.210090020927326e-07, "loss": -0.0059, "num_tokens": 41795717.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000015497207642, "sampling/importance_sampling_ratio/min": 0.5700021982192993, "sampling/sampling_logp_difference/max": 1.0141346454620361, "sampling/sampling_logp_difference/mean": 0.013170900754630566, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 205.828125, "completions/mean_terminated_length": 205.828125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24730682373046875, "epoch": 1.6188725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.052956459955202366, "kl": 0.08272158354520798, "learning_rate": 5.202972308586735e-07, "loss": 0.0008, "num_tokens": 41831658.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994138479232788, "sampling/importance_sampling_ratio/min": 0.5493014454841614, "sampling/sampling_logp_difference/max": 0.7052476406097412, "sampling/sampling_logp_difference/mean": 0.015009921044111252, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 212.0625, "completions/mean_terminated_length": 212.0625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.21910163760185242, "epoch": 1.6200980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.05277614432556432, "kl": 0.0917169526219368, "learning_rate": 5.195854184225213e-07, "loss": 0.0009, "num_tokens": 41863502.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6503266096115112, "sampling/importance_sampling_ratio/mean": 1.0001317262649536, "sampling/importance_sampling_ratio/min": 0.5389890670776367, "sampling/sampling_logp_difference/max": 0.6180599927902222, "sampling/sampling_logp_difference/mean": 0.012834159657359123, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 183.421875, "completions/mean_terminated_length": 183.421875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.22714023292064667, "epoch": 1.6213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.06480517359148961, "kl": 0.08311962336301804, "learning_rate": 5.188735662292107e-07, "loss": 0.0008, "num_tokens": 41891081.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6613457202911377, "sampling/importance_sampling_ratio/mean": 1.0001468658447266, "sampling/importance_sampling_ratio/min": 0.47708699107170105, "sampling/sampling_logp_difference/max": 0.7400565147399902, "sampling/sampling_logp_difference/mean": 0.01521068811416626, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 177.078125, "completions/mean_terminated_length": 177.078125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2419014871120453, "epoch": 1.6225490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 2.0061410801489905, "kl": 0.10726620256900787, "learning_rate": 5.181616757237561e-07, "loss": 0.034, "num_tokens": 41917582.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.9397830963134766, "sampling/importance_sampling_ratio/mean": 0.9992237687110901, "sampling/importance_sampling_ratio/min": 0.5363001227378845, "sampling/sampling_logp_difference/max": 0.6625761985778809, "sampling/sampling_logp_difference/mean": 0.014850424602627754, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 220.28125, "completions/mean_terminated_length": 220.28125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2679051160812378, "epoch": 1.6237745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.1707494264096197, "kl": 0.10701704770326614, "learning_rate": 5.174497483512505e-07, "loss": -0.023, "num_tokens": 41949152.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.627264380455017, "sampling/importance_sampling_ratio/mean": 0.9996857643127441, "sampling/importance_sampling_ratio/min": 0.4389006197452545, "sampling/sampling_logp_difference/max": 0.8234822750091553, "sampling/sampling_logp_difference/mean": 0.015957824885845184, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 166.578125, "completions/mean_terminated_length": 166.578125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.23362596333026886, "epoch": 1.625, "frac_reward_zero_std": 0.75, "grad_norm": 1.4933756703267782, "kl": 0.0941019356250763, "learning_rate": 5.167377855568612e-07, "loss": 0.0157, "num_tokens": 41980773.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.673134684562683, "sampling/importance_sampling_ratio/mean": 0.999595046043396, "sampling/importance_sampling_ratio/min": 0.5383061766624451, "sampling/sampling_logp_difference/max": 0.6193277835845947, "sampling/sampling_logp_difference/mean": 0.014315936714410782, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 200.015625, "completions/mean_terminated_length": 200.015625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.29066771268844604, "epoch": 1.6262254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.454954066410852, "kl": 0.10306625068187714, "learning_rate": 5.160257887858277e-07, "loss": -0.0341, "num_tokens": 42019382.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001888275146484, "sampling/importance_sampling_ratio/min": 0.2848387658596039, "sampling/sampling_logp_difference/max": 1.2558319568634033, "sampling/sampling_logp_difference/mean": 0.01684502884745598, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 172.640625, "completions/mean_terminated_length": 172.640625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.21323218941688538, "epoch": 1.6274509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.5167060367394343, "kl": 0.07055232673883438, "learning_rate": 5.15313759483458e-07, "loss": 0.1168, "num_tokens": 42045663.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.808783769607544, "sampling/importance_sampling_ratio/mean": 1.0004042387008667, "sampling/importance_sampling_ratio/min": 0.5261567831039429, "sampling/sampling_logp_difference/max": 0.6421560049057007, "sampling/sampling_logp_difference/mean": 0.013901956379413605, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3145783245563507, "epoch": 1.6286764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.4238544507154678, "kl": 0.142796128988266, "learning_rate": 5.146016990951268e-07, "loss": -0.0195, "num_tokens": 42075567.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992316365242004, "sampling/importance_sampling_ratio/min": 0.24553349614143372, "sampling/sampling_logp_difference/max": 1.4043219089508057, "sampling/sampling_logp_difference/mean": 0.017758475616574287, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 188.28125, "completions/mean_terminated_length": 188.28125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2112322449684143, "epoch": 1.6299019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.051044574497776975, "kl": 0.07780525088310242, "learning_rate": 5.138896090662714e-07, "loss": 0.0007, "num_tokens": 42107761.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8814224004745483, "sampling/importance_sampling_ratio/mean": 1.0002202987670898, "sampling/importance_sampling_ratio/min": 0.48437368869781494, "sampling/sampling_logp_difference/max": 0.7248985767364502, "sampling/sampling_logp_difference/mean": 0.0132973063737154, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 200.0625, "completions/mean_terminated_length": 200.0625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2907071113586426, "epoch": 1.6311274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.599292742810804, "kl": 0.11069557815790176, "learning_rate": 5.131774908423898e-07, "loss": 0.0143, "num_tokens": 42136021.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8153657913208008, "sampling/importance_sampling_ratio/mean": 0.9995972514152527, "sampling/importance_sampling_ratio/min": 0.48853954672813416, "sampling/sampling_logp_difference/max": 0.7163348197937012, "sampling/sampling_logp_difference/mean": 0.016580259427428246, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 195.109375, "completions/mean_terminated_length": 195.109375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.23820869624614716, "epoch": 1.6323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.054324159375675335, "kl": 0.07167470455169678, "learning_rate": 5.124653458690365e-07, "loss": 0.0007, "num_tokens": 42166828.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6654973030090332, "sampling/importance_sampling_ratio/mean": 0.9998932480812073, "sampling/importance_sampling_ratio/min": 0.5910135507583618, "sampling/sampling_logp_difference/max": 0.525916337966919, "sampling/sampling_logp_difference/mean": 0.014437150210142136, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 181.84375, "completions/mean_terminated_length": 181.84375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.22880329191684723, "epoch": 1.633578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.05272930597249983, "kl": 0.08534187078475952, "learning_rate": 5.117531755918207e-07, "loss": 0.0008, "num_tokens": 42194562.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5075865983963013, "sampling/importance_sampling_ratio/mean": 0.9995811581611633, "sampling/importance_sampling_ratio/min": 0.5517650842666626, "sampling/sampling_logp_difference/max": 0.5946328639984131, "sampling/sampling_logp_difference/mean": 0.013283336535096169, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 187.6875, "completions/mean_terminated_length": 187.6875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.22426791489124298, "epoch": 1.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0707230428521295, "kl": 0.09419456869363785, "learning_rate": 5.110409814564031e-07, "loss": 0.0009, "num_tokens": 42229310.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5982849597930908, "sampling/importance_sampling_ratio/mean": 0.999759316444397, "sampling/importance_sampling_ratio/min": 0.5677502155303955, "sampling/sampling_logp_difference/max": 0.5660736560821533, "sampling/sampling_logp_difference/mean": 0.013859902508556843, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 183.890625, "completions/mean_terminated_length": 183.890625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3586990237236023, "epoch": 1.6360294117647058, "frac_reward_zero_std": 0.25, "grad_norm": 2.3049812417751974, "kl": 0.158889502286911, "learning_rate": 5.103287649084926e-07, "loss": -0.0103, "num_tokens": 42258151.0, "reward": 0.34375, "reward_std": 0.676956295967102, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6215449571609497, "sampling/importance_sampling_ratio/mean": 0.9993545413017273, "sampling/importance_sampling_ratio/min": 0.40965768694877625, "sampling/sampling_logp_difference/max": 0.8924334049224854, "sampling/sampling_logp_difference/mean": 0.018728770315647125, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 165.9375, "completions/mean_terminated_length": 165.9375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.18399828672409058, "epoch": 1.6372549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.5395292659691353, "kl": 0.08521635830402374, "learning_rate": 5.096165273938435e-07, "loss": 0.0204, "num_tokens": 42285651.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9603852033615112, "sampling/importance_sampling_ratio/mean": 1.0003844499588013, "sampling/importance_sampling_ratio/min": 0.6114168167114258, "sampling/sampling_logp_difference/max": 0.6731410026550293, "sampling/sampling_logp_difference/mean": 0.011999163776636124, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 175.109375, "completions/mean_terminated_length": 175.109375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2937619090080261, "epoch": 1.6384803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 2.034304557509628, "kl": 0.15264631807804108, "learning_rate": 5.089042703582533e-07, "loss": 0.0115, "num_tokens": 42314202.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.7399550676345825, "sampling/importance_sampling_ratio/mean": 0.9997891187667847, "sampling/importance_sampling_ratio/min": 0.44116657972335815, "sampling/sampling_logp_difference/max": 0.8183327317237854, "sampling/sampling_logp_difference/mean": 0.016184452921152115, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 212.71875, "completions/mean_terminated_length": 212.71875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3052104115486145, "epoch": 1.6397058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 2.2976913823008105, "kl": 0.11412011086940765, "learning_rate": 5.081919952475583e-07, "loss": 0.0008, "num_tokens": 42351208.0, "reward": 0.5, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5723400115966797, "sampling/importance_sampling_ratio/mean": 0.9997768402099609, "sampling/importance_sampling_ratio/min": 0.44444528222084045, "sampling/sampling_logp_difference/max": 0.8109283447265625, "sampling/sampling_logp_difference/mean": 0.017344534397125244, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 190.578125, "completions/mean_terminated_length": 190.578125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2736814618110657, "epoch": 1.6409313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 1.8251000890125455, "kl": 0.08616019040346146, "learning_rate": 5.074797035076318e-07, "loss": -0.0145, "num_tokens": 42378077.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8718525171279907, "sampling/importance_sampling_ratio/mean": 1.000880241394043, "sampling/importance_sampling_ratio/min": 0.6115580201148987, "sampling/sampling_logp_difference/max": 0.6269285678863525, "sampling/sampling_logp_difference/mean": 0.014431033283472061, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 228.140625, "completions/mean_terminated_length": 228.140625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2970915138721466, "epoch": 1.642156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.9868492072910282, "kl": 0.0832802876830101, "learning_rate": 5.067673965843812e-07, "loss": -0.011, "num_tokens": 42409382.0, "reward": 0.4375, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99962317943573, "sampling/importance_sampling_ratio/min": 0.2602560520172119, "sampling/sampling_logp_difference/max": 1.3460893630981445, "sampling/sampling_logp_difference/mean": 0.015678174793720245, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 250.53125, "completions/mean_terminated_length": 250.53125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.26596561074256897, "epoch": 1.6433823529411766, "frac_reward_zero_std": 0.25, "grad_norm": 1.7119751113149253, "kl": 0.07231054455041885, "learning_rate": 5.060550759237441e-07, "loss": -0.0751, "num_tokens": 42441896.0, "reward": 0.5625, "reward_std": 0.5915650129318237, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006382465362549, "sampling/importance_sampling_ratio/min": 0.17431412637233734, "sampling/sampling_logp_difference/max": 1.7468962669372559, "sampling/sampling_logp_difference/mean": 0.01413971371948719, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.24031300842761993, "epoch": 1.6446078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.333026429331567, "kl": 0.08541548997163773, "learning_rate": 5.053427429716866e-07, "loss": 0.0076, "num_tokens": 42473668.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7207510471343994, "sampling/importance_sampling_ratio/mean": 1.000162124633789, "sampling/importance_sampling_ratio/min": 0.5210290551185608, "sampling/sampling_logp_difference/max": 0.6519495248794556, "sampling/sampling_logp_difference/mean": 0.014477924443781376, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 197.921875, "completions/mean_terminated_length": 197.921875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.25727108120918274, "epoch": 1.6458333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 2.016201733244008, "kl": 0.09245032072067261, "learning_rate": 5.046303991741993e-07, "loss": -0.0402, "num_tokens": 42503807.0, "reward": 0.71875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6272691488265991, "sampling/importance_sampling_ratio/mean": 0.9992700815200806, "sampling/importance_sampling_ratio/min": 0.6096329092979431, "sampling/sampling_logp_difference/max": 0.49489831924438477, "sampling/sampling_logp_difference/mean": 0.013924137689173222, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 207.1875, "completions/mean_terminated_length": 207.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3347078263759613, "epoch": 1.6470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.06318886619615308, "kl": 0.11795598268508911, "learning_rate": 5.039180459772949e-07, "loss": 0.0012, "num_tokens": 42534891.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001790523529053, "sampling/importance_sampling_ratio/min": 0.4022471308708191, "sampling/sampling_logp_difference/max": 0.9106886386871338, "sampling/sampling_logp_difference/mean": 0.018503038212656975, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 136.9375, "completions/mean_terminated_length": 136.9375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.21099574863910675, "epoch": 1.6482843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.06328075753077166, "kl": 0.08794344216585159, "learning_rate": 5.032056848270056e-07, "loss": 0.0009, "num_tokens": 42558279.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.561264991760254, "sampling/importance_sampling_ratio/mean": 0.9999595880508423, "sampling/importance_sampling_ratio/min": 0.613577663898468, "sampling/sampling_logp_difference/max": 0.48844844102859497, "sampling/sampling_logp_difference/mean": 0.013360563665628433, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.20779091119766235, "epoch": 1.6495098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 2.2777246815639125, "kl": 0.09795111417770386, "learning_rate": 5.02493317169379e-07, "loss": -0.0148, "num_tokens": 42583055.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7928379774093628, "sampling/importance_sampling_ratio/mean": 1.0002166032791138, "sampling/importance_sampling_ratio/min": 0.5677189826965332, "sampling/sampling_logp_difference/max": 0.5837998390197754, "sampling/sampling_logp_difference/mean": 0.013950050808489323, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 157.734375, "completions/mean_terminated_length": 157.734375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.23696336150169373, "epoch": 1.6507352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.059585662552681225, "kl": 0.10689768195152283, "learning_rate": 5.017809444504767e-07, "loss": 0.0011, "num_tokens": 42612366.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6983774900436401, "sampling/importance_sampling_ratio/mean": 1.0000231266021729, "sampling/importance_sampling_ratio/min": 0.5634658336639404, "sampling/sampling_logp_difference/max": 0.5736486911773682, "sampling/sampling_logp_difference/mean": 0.015318666584789753, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 236.890625, "completions/mean_terminated_length": 236.890625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.23486870527267456, "epoch": 1.6519607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.1154581101547827, "kl": 0.07553314417600632, "learning_rate": 5.010685681163698e-07, "loss": 0.0561, "num_tokens": 42647559.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000604391098022, "sampling/importance_sampling_ratio/min": 0.3292165994644165, "sampling/sampling_logp_difference/max": 1.111039400100708, "sampling/sampling_logp_difference/mean": 0.013525919988751411, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 184.1875, "completions/mean_terminated_length": 184.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3114980161190033, "epoch": 1.653186274509804, "frac_reward_zero_std": 0.25, "grad_norm": 2.2259090560583856, "kl": 0.11155524849891663, "learning_rate": 5.003561896131374e-07, "loss": 0.0145, "num_tokens": 42680403.0, "reward": 0.8125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5709562301635742, "sampling/importance_sampling_ratio/mean": 0.9993911385536194, "sampling/importance_sampling_ratio/min": 0.6183300614356995, "sampling/sampling_logp_difference/max": 0.4807329773902893, "sampling/sampling_logp_difference/mean": 0.01637909933924675, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 203.78125, "completions/mean_terminated_length": 203.78125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.23865506052970886, "epoch": 1.6544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.04475247985952317, "kl": 0.07838784903287888, "learning_rate": 4.996438103868625e-07, "loss": 0.0007, "num_tokens": 42712293.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7526777982711792, "sampling/importance_sampling_ratio/mean": 1.0003411769866943, "sampling/importance_sampling_ratio/min": 0.6225943565368652, "sampling/sampling_logp_difference/max": 0.5611448287963867, "sampling/sampling_logp_difference/mean": 0.014617128297686577, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 203.96875, "completions/mean_terminated_length": 203.96875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2744845449924469, "epoch": 1.655637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.7408160022259205, "kl": 0.11861857771873474, "learning_rate": 4.989314318836302e-07, "loss": -0.0126, "num_tokens": 42741907.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.54710054397583, "sampling/importance_sampling_ratio/mean": 0.999839186668396, "sampling/importance_sampling_ratio/min": 0.5887571573257446, "sampling/sampling_logp_difference/max": 0.5297415256500244, "sampling/sampling_logp_difference/mean": 0.016750134527683258, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 150.28125, "completions/mean_terminated_length": 150.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18784059584140778, "epoch": 1.656862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.8598833573971085, "kl": 0.13575217127799988, "learning_rate": 4.982190555495235e-07, "loss": -0.012, "num_tokens": 42765061.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7080327272415161, "sampling/importance_sampling_ratio/mean": 0.9997662901878357, "sampling/importance_sampling_ratio/min": 0.5367580056190491, "sampling/sampling_logp_difference/max": 0.6222078800201416, "sampling/sampling_logp_difference/mean": 0.013174982741475105, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 183.921875, "completions/mean_terminated_length": 183.921875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2240990251302719, "epoch": 1.6580882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.86401773438621, "kl": 0.08727099746465683, "learning_rate": 4.975066828306209e-07, "loss": -0.0095, "num_tokens": 42792416.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005711317062378, "sampling/importance_sampling_ratio/min": 0.4520740509033203, "sampling/sampling_logp_difference/max": 0.8031885623931885, "sampling/sampling_logp_difference/mean": 0.015043207444250584, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 162.65625, "completions/mean_terminated_length": 162.65625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2405478060245514, "epoch": 1.659313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 2.242041603566026, "kl": 0.16278581321239471, "learning_rate": 4.967943151729944e-07, "loss": 0.0576, "num_tokens": 42817242.0, "reward": 0.59375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6590238809585571, "sampling/importance_sampling_ratio/mean": 0.9999129772186279, "sampling/importance_sampling_ratio/min": 0.5087666511535645, "sampling/sampling_logp_difference/max": 0.6757657527923584, "sampling/sampling_logp_difference/mean": 0.013905617408454418, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 160.71875, "completions/mean_terminated_length": 160.71875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1898893117904663, "epoch": 1.6605392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.3933831662992229, "kl": 0.07735336571931839, "learning_rate": 4.96081954022705e-07, "loss": -0.0671, "num_tokens": 42842168.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.403991460800171, "sampling/importance_sampling_ratio/mean": 0.9996185302734375, "sampling/importance_sampling_ratio/min": 0.5681336522102356, "sampling/sampling_logp_difference/max": 0.5653985738754272, "sampling/sampling_logp_difference/mean": 0.011938979849219322, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2664642632007599, "epoch": 1.6617647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 1.1981326528665563, "kl": 0.09241677820682526, "learning_rate": 4.953696008258008e-07, "loss": 0.0475, "num_tokens": 42868064.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6814212799072266, "sampling/importance_sampling_ratio/mean": 1.0005993843078613, "sampling/importance_sampling_ratio/min": 0.5139349699020386, "sampling/sampling_logp_difference/max": 0.6656584739685059, "sampling/sampling_logp_difference/mean": 0.015398381277918816, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 161.796875, "completions/mean_terminated_length": 161.796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2700577974319458, "epoch": 1.6629901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 2.531257986742057, "kl": 0.2919192314147949, "learning_rate": 4.946572570283134e-07, "loss": 0.0126, "num_tokens": 42894803.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6232237815856934, "sampling/importance_sampling_ratio/mean": 0.9996989965438843, "sampling/importance_sampling_ratio/min": 0.013000335544347763, "sampling/sampling_logp_difference/max": 4.342780113220215, "sampling/sampling_logp_difference/mean": 0.01691490039229393, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.339409738779068, "epoch": 1.6642156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 1.8952017692855232, "kl": 0.12340100109577179, "learning_rate": 4.939449240762558e-07, "loss": -0.0421, "num_tokens": 42922131.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000413417816162, "sampling/importance_sampling_ratio/min": 0.4595952332019806, "sampling/sampling_logp_difference/max": 0.7774090766906738, "sampling/sampling_logp_difference/mean": 0.01715453900396824, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 145.953125, "completions/mean_terminated_length": 145.953125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.23316159844398499, "epoch": 1.6654411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0963002270490013, "kl": 0.10303225368261337, "learning_rate": 4.932326034156189e-07, "loss": 0.001, "num_tokens": 42950576.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6010249853134155, "sampling/importance_sampling_ratio/mean": 1.0004855394363403, "sampling/importance_sampling_ratio/min": 0.5264350175857544, "sampling/sampling_logp_difference/max": 0.641627311706543, "sampling/sampling_logp_difference/mean": 0.01539541594684124, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 153.203125, "completions/mean_terminated_length": 153.203125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.18769334256649017, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.613985052269561, "kl": 0.09362373501062393, "learning_rate": 4.925202964923683e-07, "loss": 0.0005, "num_tokens": 42974941.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999897480010986, "sampling/importance_sampling_ratio/min": 0.5657834410667419, "sampling/sampling_logp_difference/max": 0.7541909217834473, "sampling/sampling_logp_difference/mean": 0.014007753692567348, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 219.8125, "completions/mean_terminated_length": 219.8125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.25269484519958496, "epoch": 1.6678921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 1.5837754600410165, "kl": 0.09320004284381866, "learning_rate": 4.918080047524417e-07, "loss": 0.0256, "num_tokens": 43004049.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003734827041626, "sampling/importance_sampling_ratio/min": 0.6183974146842957, "sampling/sampling_logp_difference/max": 0.7329139709472656, "sampling/sampling_logp_difference/mean": 0.014377745799720287, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 142.515625, "completions/mean_terminated_length": 142.515625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2503388822078705, "epoch": 1.6691176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.19241528737319635, "kl": 0.17062516510486603, "learning_rate": 4.910957296417467e-07, "loss": 0.0019, "num_tokens": 43025442.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.693943977355957, "sampling/importance_sampling_ratio/mean": 0.999955415725708, "sampling/importance_sampling_ratio/min": 0.4887165129184723, "sampling/sampling_logp_difference/max": 0.7159726619720459, "sampling/sampling_logp_difference/mean": 0.01492222212255001, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 193.578125, "completions/mean_terminated_length": 193.578125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.30455151200294495, "epoch": 1.670343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.6876242542682152, "kl": 0.16102200746536255, "learning_rate": 4.903834726061564e-07, "loss": -0.0021, "num_tokens": 43059143.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998186826705933, "sampling/importance_sampling_ratio/min": 0.5035651922225952, "sampling/sampling_logp_difference/max": 0.7266943454742432, "sampling/sampling_logp_difference/mean": 0.017101185396313667, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 174.03125, "completions/mean_terminated_length": 174.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.24065756797790527, "epoch": 1.6715686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.1289676912772164, "kl": 0.09058039635419846, "learning_rate": 4.896712350915074e-07, "loss": 0.0009, "num_tokens": 43095993.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998894333839417, "sampling/importance_sampling_ratio/min": 0.4640738368034363, "sampling/sampling_logp_difference/max": 0.7677116394042969, "sampling/sampling_logp_difference/mean": 0.01515038963407278, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 191.78125, "completions/mean_terminated_length": 191.78125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.21339699625968933, "epoch": 1.6727941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.5234317958528387, "kl": 0.09759987890720367, "learning_rate": 4.889590185435969e-07, "loss": -0.0205, "num_tokens": 43127371.0, "reward": 0.375, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.85578191280365, "sampling/importance_sampling_ratio/mean": 0.9998263120651245, "sampling/importance_sampling_ratio/min": 0.5853095054626465, "sampling/sampling_logp_difference/max": 0.6183061599731445, "sampling/sampling_logp_difference/mean": 0.013212048448622227, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4170733094215393, "epoch": 1.6740196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 2.0924402915304783, "kl": 0.18056243658065796, "learning_rate": 4.882468244081792e-07, "loss": 0.0004, "num_tokens": 43163827.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6581690311431885, "sampling/importance_sampling_ratio/mean": 1.0002093315124512, "sampling/importance_sampling_ratio/min": 0.5444231629371643, "sampling/sampling_logp_difference/max": 0.6080284118652344, "sampling/sampling_logp_difference/mean": 0.020893795415759087, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 242.46875, "completions/mean_terminated_length": 242.46875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.17147576808929443, "epoch": 1.6752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.0212249478750737, "kl": 0.10019063949584961, "learning_rate": 4.875346541309636e-07, "loss": 0.0077, "num_tokens": 43197793.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.646515130996704, "sampling/importance_sampling_ratio/mean": 1.000142216682434, "sampling/importance_sampling_ratio/min": 0.617138683795929, "sampling/sampling_logp_difference/max": 0.4986610412597656, "sampling/sampling_logp_difference/mean": 0.0103620495647192, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 233.5625, "completions/mean_terminated_length": 233.5625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2481188178062439, "epoch": 1.6764705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.3466141791726334, "kl": 0.07373310625553131, "learning_rate": 4.868225091576102e-07, "loss": 0.0171, "num_tokens": 43230949.0, "reward": 0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4394497871398926, "sampling/importance_sampling_ratio/mean": 0.9995990991592407, "sampling/importance_sampling_ratio/min": 0.5695202350616455, "sampling/sampling_logp_difference/max": 0.5629609823226929, "sampling/sampling_logp_difference/mean": 0.013606008142232895, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 177.359375, "completions/mean_terminated_length": 177.359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.28148430585861206, "epoch": 1.6776960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.9725140313637766, "kl": 0.10703254491090775, "learning_rate": 4.861103909337285e-07, "loss": 0.0291, "num_tokens": 43260828.0, "reward": 0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004048347473145, "sampling/importance_sampling_ratio/min": 0.09236104041337967, "sampling/sampling_logp_difference/max": 2.382050037384033, "sampling/sampling_logp_difference/mean": 0.014763194136321545, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 194.328125, "completions/mean_terminated_length": 194.328125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.289120078086853, "epoch": 1.678921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.09266729126875, "kl": 0.11325451731681824, "learning_rate": 4.853983009048732e-07, "loss": -0.0346, "num_tokens": 43293713.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5475056171417236, "sampling/importance_sampling_ratio/mean": 1.0003206729888916, "sampling/importance_sampling_ratio/min": 0.5568208694458008, "sampling/sampling_logp_difference/max": 0.5855116844177246, "sampling/sampling_logp_difference/mean": 0.015658417716622353, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 210.890625, "completions/mean_terminated_length": 210.890625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3526790142059326, "epoch": 1.6801470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 2.0867761324539056, "kl": 0.15972676873207092, "learning_rate": 4.84686240516542e-07, "loss": 0.0229, "num_tokens": 43323930.0, "reward": 0.3125, "reward_std": 0.6143567562103271, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5467427968978882, "sampling/importance_sampling_ratio/mean": 1.0000510215759277, "sampling/importance_sampling_ratio/min": 0.47393912076950073, "sampling/sampling_logp_difference/max": 0.7466764450073242, "sampling/sampling_logp_difference/mean": 0.017478249967098236, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.20949649810791016, "epoch": 1.6813725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.2638909903023694, "kl": 0.06713837385177612, "learning_rate": 4.839742112141724e-07, "loss": 0.0452, "num_tokens": 43354642.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001473426818848, "sampling/importance_sampling_ratio/min": 0.4337219297885895, "sampling/sampling_logp_difference/max": 0.8726010322570801, "sampling/sampling_logp_difference/mean": 0.01308098528534174, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 190.109375, "completions/mean_terminated_length": 190.109375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.29467177391052246, "epoch": 1.6825980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.8858091141384608, "kl": 0.11413323134183884, "learning_rate": 4.832622144431388e-07, "loss": 0.0203, "num_tokens": 43385577.0, "reward": 0.5, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5071555376052856, "sampling/importance_sampling_ratio/mean": 0.9998645186424255, "sampling/importance_sampling_ratio/min": 0.28268373012542725, "sampling/sampling_logp_difference/max": 1.2634265422821045, "sampling/sampling_logp_difference/mean": 0.01592201367020607, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 206.546875, "completions/mean_terminated_length": 206.546875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3291833996772766, "epoch": 1.6838235294117647, "frac_reward_zero_std": 0.25, "grad_norm": 1.9178705450875981, "kl": 0.12914146482944489, "learning_rate": 4.825502516487496e-07, "loss": 0.0166, "num_tokens": 43418764.0, "reward": -0.125, "reward_std": 0.6663130521774292, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6523675918579102, "sampling/importance_sampling_ratio/mean": 1.0004007816314697, "sampling/importance_sampling_ratio/min": 0.6056222319602966, "sampling/sampling_logp_difference/max": 0.5022091865539551, "sampling/sampling_logp_difference/mean": 0.01776885613799095, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 203.453125, "completions/mean_terminated_length": 203.453125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3313523530960083, "epoch": 1.6850490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.8827964833937998, "kl": 0.14949117600917816, "learning_rate": 4.818383242762439e-07, "loss": 0.0148, "num_tokens": 43456601.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8197872638702393, "sampling/importance_sampling_ratio/mean": 0.9996486902236938, "sampling/importance_sampling_ratio/min": 0.5531119704246521, "sampling/sampling_logp_difference/max": 0.598719596862793, "sampling/sampling_logp_difference/mean": 0.017888199537992477, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 195.15625, "completions/mean_terminated_length": 195.15625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2908930778503418, "epoch": 1.6862745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 2.004422595938958, "kl": 0.16500240564346313, "learning_rate": 4.811264337707894e-07, "loss": -0.0089, "num_tokens": 43484275.0, "reward": 0.15625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.603110432624817, "sampling/importance_sampling_ratio/mean": 1.0009064674377441, "sampling/importance_sampling_ratio/min": 0.6171850562095642, "sampling/sampling_logp_difference/max": 0.4825863838195801, "sampling/sampling_logp_difference/mean": 0.014188327826559544, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 178.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2866644859313965, "epoch": 1.6875, "frac_reward_zero_std": 0.5, "grad_norm": 2.153546925697768, "kl": 0.17025037109851837, "learning_rate": 4.804145815774786e-07, "loss": 0.0083, "num_tokens": 43515667.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002154111862183, "sampling/importance_sampling_ratio/min": 0.342887818813324, "sampling/sampling_logp_difference/max": 1.0703519582748413, "sampling/sampling_logp_difference/mean": 0.01696593686938286, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 181.03125, "completions/mean_terminated_length": 181.03125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.29743701219558716, "epoch": 1.6887254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 2.0662312476330906, "kl": 0.1554834097623825, "learning_rate": 4.797027691413267e-07, "loss": 0.0335, "num_tokens": 43541717.0, "reward": -0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.580533504486084, "sampling/importance_sampling_ratio/mean": 1.000537395477295, "sampling/importance_sampling_ratio/min": 0.5968798398971558, "sampling/sampling_logp_difference/max": 0.5160394906997681, "sampling/sampling_logp_difference/mean": 0.016106370836496353, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 255.859375, "completions/mean_terminated_length": 255.859375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.33325445652008057, "epoch": 1.6899509803921569, "frac_reward_zero_std": 0.0, "grad_norm": 2.158591515821032, "kl": 0.11241887509822845, "learning_rate": 4.789909979072673e-07, "loss": -0.04, "num_tokens": 43579564.0, "reward": 0.25, "reward_std": 0.9245119094848633, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4743503332138062, "sampling/importance_sampling_ratio/mean": 0.9997340440750122, "sampling/importance_sampling_ratio/min": 0.5142372846603394, "sampling/sampling_logp_difference/max": 0.6650705337524414, "sampling/sampling_logp_difference/mean": 0.017467955127358437, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 204.3125, "completions/mean_terminated_length": 204.3125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35654231905937195, "epoch": 1.6911764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.9717870161455187, "kl": 0.13017581403255463, "learning_rate": 4.782792693201513e-07, "loss": -0.0003, "num_tokens": 43610064.0, "reward": -0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": -0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9441932439804077, "sampling/importance_sampling_ratio/mean": 0.9997327327728271, "sampling/importance_sampling_ratio/min": 0.42920300364494324, "sampling/sampling_logp_difference/max": 0.8458253145217896, "sampling/sampling_logp_difference/mean": 0.018064867705106735, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.33480149507522583, "epoch": 1.6924019607843137, "frac_reward_zero_std": 0.25, "grad_norm": 2.50333582908066, "kl": 0.1297426074743271, "learning_rate": 4.775675848247427e-07, "loss": 0.0687, "num_tokens": 43641392.0, "reward": 0.53125, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6577258110046387, "sampling/importance_sampling_ratio/mean": 0.999484658241272, "sampling/importance_sampling_ratio/min": 0.48239606618881226, "sampling/sampling_logp_difference/max": 0.7289897799491882, "sampling/sampling_logp_difference/mean": 0.01659483090043068, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 166.203125, "completions/mean_terminated_length": 166.203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24771378934383392, "epoch": 1.6936274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.3521541468766267, "kl": 0.1873389482498169, "learning_rate": 4.768559458657155e-07, "loss": -0.0039, "num_tokens": 43667037.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994951486587524, "sampling/importance_sampling_ratio/min": 0.48710572719573975, "sampling/sampling_logp_difference/max": 0.7192740440368652, "sampling/sampling_logp_difference/mean": 0.015180293470621109, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28814393281936646, "epoch": 1.6948529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 2.2128747355978162, "kl": 0.09616734087467194, "learning_rate": 4.7614435388765203e-07, "loss": 0.0534, "num_tokens": 43703829.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9280849695205688, "sampling/importance_sampling_ratio/mean": 1.0000053644180298, "sampling/importance_sampling_ratio/min": 0.22572924196720123, "sampling/sampling_logp_difference/max": 1.4884190559387207, "sampling/sampling_logp_difference/mean": 0.01692255213856697, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 144.046875, "completions/mean_terminated_length": 144.046875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2953818440437317, "epoch": 1.696078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 2.204259061567527, "kl": 0.11235234141349792, "learning_rate": 4.7543281033503885e-07, "loss": 0.0196, "num_tokens": 43732008.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.7009391784667969, "sampling/importance_sampling_ratio/mean": 0.9996429085731506, "sampling/importance_sampling_ratio/min": 0.6176415085792542, "sampling/sampling_logp_difference/max": 0.5311806201934814, "sampling/sampling_logp_difference/mean": 0.017398793250322342, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.25864797830581665, "epoch": 1.6973039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.2438733835222997, "kl": 0.06752866506576538, "learning_rate": 4.747213166522644e-07, "loss": -0.005, "num_tokens": 43768720.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.682944893836975, "sampling/importance_sampling_ratio/mean": 0.9999276399612427, "sampling/importance_sampling_ratio/min": 0.5483670830726624, "sampling/sampling_logp_difference/max": 0.6008102893829346, "sampling/sampling_logp_difference/mean": 0.012968155555427074, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 180.40625, "completions/mean_terminated_length": 180.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.270290732383728, "epoch": 1.6985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.04900553241285253, "kl": 0.1089608296751976, "learning_rate": 4.740098742836156e-07, "loss": 0.001, "num_tokens": 43794346.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5467870235443115, "sampling/importance_sampling_ratio/mean": 0.9996156692504883, "sampling/importance_sampling_ratio/min": 0.44426000118255615, "sampling/sampling_logp_difference/max": 0.8113453388214111, "sampling/sampling_logp_difference/mean": 0.013380180113017559, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 213.390625, "completions/mean_terminated_length": 213.390625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.31980079412460327, "epoch": 1.6997549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.8001131038347111, "kl": 0.10845265537500381, "learning_rate": 4.732984846732755e-07, "loss": 0.033, "num_tokens": 43826227.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6718604564666748, "sampling/importance_sampling_ratio/mean": 0.9998260736465454, "sampling/importance_sampling_ratio/min": 0.4855412542819977, "sampling/sampling_logp_difference/max": 0.7224910259246826, "sampling/sampling_logp_difference/mean": 0.015616693533957005, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 202.984375, "completions/mean_terminated_length": 202.984375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.1872464269399643, "epoch": 1.7009803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.04457015442774327, "kl": 0.08273518085479736, "learning_rate": 4.725871492653199e-07, "loss": 0.0008, "num_tokens": 43856914.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4354684352874756, "sampling/importance_sampling_ratio/mean": 0.9999713897705078, "sampling/importance_sampling_ratio/min": 0.6215065121650696, "sampling/sampling_logp_difference/max": 0.47560882568359375, "sampling/sampling_logp_difference/mean": 0.011172572150826454, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 189.546875, "completions/mean_terminated_length": 189.546875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.27226871252059937, "epoch": 1.7022058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.2768027253593581, "kl": 0.1160283237695694, "learning_rate": 4.718758695037149e-07, "loss": -0.0119, "num_tokens": 43884901.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.795386552810669, "sampling/importance_sampling_ratio/mean": 0.9995177984237671, "sampling/importance_sampling_ratio/min": 0.617138147354126, "sampling/sampling_logp_difference/max": 0.5852203369140625, "sampling/sampling_logp_difference/mean": 0.015012630261480808, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 204.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3289707899093628, "epoch": 1.7034313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.0254271834874007, "kl": 0.14430175721645355, "learning_rate": 4.7116464683231285e-07, "loss": -0.004, "num_tokens": 43920657.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8759516477584839, "sampling/importance_sampling_ratio/mean": 1.0001757144927979, "sampling/importance_sampling_ratio/min": 0.4629943072795868, "sampling/sampling_logp_difference/max": 0.7700405120849609, "sampling/sampling_logp_difference/mean": 0.01841399446129799, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.33329734206199646, "epoch": 1.704656862745098, "frac_reward_zero_std": 0.25, "grad_norm": 1.951807007220008, "kl": 0.13255050778388977, "learning_rate": 4.704534826948509e-07, "loss": 0.0051, "num_tokens": 43955569.0, "reward": 0.5625, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001225471496582, "sampling/importance_sampling_ratio/min": 0.41612565517425537, "sampling/sampling_logp_difference/max": 0.8767680525779724, "sampling/sampling_logp_difference/mean": 0.016625236719846725, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 157.890625, "completions/mean_terminated_length": 157.890625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.22026416659355164, "epoch": 1.7058823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.07321305838820764, "kl": 0.10328660905361176, "learning_rate": 4.6974237853494744e-07, "loss": 0.0009, "num_tokens": 43984746.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000003695487976, "sampling/importance_sampling_ratio/min": 0.33506107330322266, "sampling/sampling_logp_difference/max": 1.093442440032959, "sampling/sampling_logp_difference/mean": 0.01616033911705017, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.32607588171958923, "epoch": 1.7071078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.169039318542256, "kl": 0.16327345371246338, "learning_rate": 4.690313357960985e-07, "loss": -0.0089, "num_tokens": 44019010.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.8102972507476807, "sampling/importance_sampling_ratio/mean": 1.000331163406372, "sampling/importance_sampling_ratio/min": 0.21244746446609497, "sampling/sampling_logp_difference/max": 1.549060583114624, "sampling/sampling_logp_difference/mean": 0.015983667224645615, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 170.59375, "completions/mean_terminated_length": 170.59375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.30761396884918213, "epoch": 1.7083333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.6522475555481437, "kl": 0.14691162109375, "learning_rate": 4.68320355921676e-07, "loss": 0.0122, "num_tokens": 44047800.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8312194347381592, "sampling/importance_sampling_ratio/mean": 1.0005525350570679, "sampling/importance_sampling_ratio/min": 0.5328331589698792, "sampling/sampling_logp_difference/max": 0.6295468807220459, "sampling/sampling_logp_difference/mean": 0.015194841660559177, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 221.546875, "completions/mean_terminated_length": 221.546875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2581326961517334, "epoch": 1.7095588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.04480658682677194, "kl": 0.0881354808807373, "learning_rate": 4.67609440354924e-07, "loss": 0.0008, "num_tokens": 44082923.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007344722747803, "sampling/importance_sampling_ratio/mean": 1.0008583068847656, "sampling/importance_sampling_ratio/min": 0.5205962657928467, "sampling/sampling_logp_difference/max": 0.6527805328369141, "sampling/sampling_logp_difference/mean": 0.014604821801185608, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 261.296875, "completions/mean_terminated_length": 261.296875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4104100465774536, "epoch": 1.7107843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.2406398167479664, "kl": 0.10742566734552383, "learning_rate": 4.668985905389563e-07, "loss": 0.0187, "num_tokens": 44122366.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5020952224731445, "sampling/importance_sampling_ratio/mean": 1.0001171827316284, "sampling/importance_sampling_ratio/min": 0.6024896502494812, "sampling/sampling_logp_difference/max": 0.5066847801208496, "sampling/sampling_logp_difference/mean": 0.01636691950261593, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.27860528230667114, "epoch": 1.7120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.1965616760044813, "kl": 0.0851714164018631, "learning_rate": 4.661878079167526e-07, "loss": 0.0204, "num_tokens": 44157814.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8656429052352905, "sampling/importance_sampling_ratio/mean": 0.9995005130767822, "sampling/importance_sampling_ratio/min": 0.48173046112060547, "sampling/sampling_logp_difference/max": 0.7303705215454102, "sampling/sampling_logp_difference/mean": 0.015480025671422482, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 204.171875, "completions/mean_terminated_length": 204.171875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.24089059233665466, "epoch": 1.7132352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 1.3616082743869733, "kl": 0.10055754333734512, "learning_rate": 4.6547709393115677e-07, "loss": 0.0101, "num_tokens": 44186033.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.601683497428894, "sampling/importance_sampling_ratio/mean": 0.9999518990516663, "sampling/importance_sampling_ratio/min": 0.4732785224914551, "sampling/sampling_logp_difference/max": 0.7480711936950684, "sampling/sampling_logp_difference/mean": 0.014110399410128593, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 168.640625, "completions/mean_terminated_length": 168.640625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3112737536430359, "epoch": 1.7144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.5091896296688616, "kl": 0.11020766198635101, "learning_rate": 4.6476645002487295e-07, "loss": -0.0083, "num_tokens": 44219178.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5657960176467896, "sampling/importance_sampling_ratio/mean": 0.9997831583023071, "sampling/importance_sampling_ratio/min": 0.5919474959373474, "sampling/sampling_logp_difference/max": 0.5243372917175293, "sampling/sampling_logp_difference/mean": 0.017252368852496147, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 167.71875, "completions/mean_terminated_length": 167.71875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2800918221473694, "epoch": 1.715686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.3703025697659408, "kl": 0.1219969168305397, "learning_rate": 4.640558776404639e-07, "loss": 0.0302, "num_tokens": 44252744.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5375255346298218, "sampling/importance_sampling_ratio/mean": 0.9999685287475586, "sampling/importance_sampling_ratio/min": 0.24667833745479584, "sampling/sampling_logp_difference/max": 1.399670124053955, "sampling/sampling_logp_difference/mean": 0.015770550817251205, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 175.390625, "completions/mean_terminated_length": 175.390625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.27610814571380615, "epoch": 1.7169117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.7524755269519727, "kl": 0.12056620419025421, "learning_rate": 4.633453782203458e-07, "loss": -0.0351, "num_tokens": 44278529.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002723932266235, "sampling/importance_sampling_ratio/min": 0.3573823869228363, "sampling/sampling_logp_difference/max": 1.0289490222930908, "sampling/sampling_logp_difference/mean": 0.016771188005805016, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 175.140625, "completions/mean_terminated_length": 175.140625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3369879126548767, "epoch": 1.718137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830706028504374, "kl": 0.2310381531715393, "learning_rate": 4.626349532067879e-07, "loss": 0.0021, "num_tokens": 44308106.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.499764084815979, "sampling/importance_sampling_ratio/mean": 0.99969482421875, "sampling/importance_sampling_ratio/min": 0.6117817163467407, "sampling/sampling_logp_difference/max": 0.4913797378540039, "sampling/sampling_logp_difference/mean": 0.016985496506094933, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 141.84375, "completions/mean_terminated_length": 141.84375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.14914551377296448, "epoch": 1.719362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.11856309333521355, "kl": 0.1281770020723343, "learning_rate": 4.6192460404190793e-07, "loss": 0.0012, "num_tokens": 44334448.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7212589979171753, "sampling/importance_sampling_ratio/mean": 0.9995043277740479, "sampling/importance_sampling_ratio/min": 0.14807994663715363, "sampling/sampling_logp_difference/max": 1.9100029468536377, "sampling/sampling_logp_difference/mean": 0.010671250522136688, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 177.859375, "completions/mean_terminated_length": 177.859375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.35706669092178345, "epoch": 1.7205882352941178, "frac_reward_zero_std": 0.25, "grad_norm": 2.161484119754085, "kl": 0.1885259598493576, "learning_rate": 4.6121433216766935e-07, "loss": -0.0016, "num_tokens": 44365031.0, "reward": 0.71875, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7002179622650146, "sampling/importance_sampling_ratio/mean": 0.9996144771575928, "sampling/importance_sampling_ratio/min": 0.46483224630355835, "sampling/sampling_logp_difference/max": 0.7660787105560303, "sampling/sampling_logp_difference/mean": 0.016989562660455704, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 136.234375, "completions/mean_terminated_length": 136.234375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2509397268295288, "epoch": 1.721813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.279440526685593, "kl": 0.11184224486351013, "learning_rate": 4.605041390258794e-07, "loss": 0.0056, "num_tokens": 44391654.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4854940176010132, "sampling/importance_sampling_ratio/mean": 1.0001810789108276, "sampling/importance_sampling_ratio/min": 0.6172491908073425, "sampling/sampling_logp_difference/max": 0.4824824333190918, "sampling/sampling_logp_difference/mean": 0.013479331508278847, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 174.46875, "completions/mean_terminated_length": 174.46875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3067903518676758, "epoch": 1.7230392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.9470053509376697, "kl": 0.1597614288330078, "learning_rate": 4.5979402605818514e-07, "loss": -0.001, "num_tokens": 44421972.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7133095264434814, "sampling/importance_sampling_ratio/mean": 1.000095009803772, "sampling/importance_sampling_ratio/min": 0.4271281063556671, "sampling/sampling_logp_difference/max": 0.8506712913513184, "sampling/sampling_logp_difference/mean": 0.016327355057001114, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 149.140625, "completions/mean_terminated_length": 149.140625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.32089683413505554, "epoch": 1.7242647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.7019283557406544, "kl": 0.13303419947624207, "learning_rate": 4.5908399470607104e-07, "loss": -0.0008, "num_tokens": 44447981.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5467069149017334, "sampling/importance_sampling_ratio/mean": 1.0000979900360107, "sampling/importance_sampling_ratio/min": 0.4128660261631012, "sampling/sampling_logp_difference/max": 0.8846321105957031, "sampling/sampling_logp_difference/mean": 0.01682734489440918, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 198.296875, "completions/mean_terminated_length": 198.296875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.4076573848724365, "epoch": 1.7254901960784315, "frac_reward_zero_std": 0.0, "grad_norm": 2.77545471302357, "kl": 0.1400512307882309, "learning_rate": 4.5837404641085535e-07, "loss": -0.0141, "num_tokens": 44487152.0, "reward": 0.3125, "reward_std": 0.6707825064659119, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9102158546447754, "sampling/importance_sampling_ratio/mean": 1.0003080368041992, "sampling/importance_sampling_ratio/min": 0.36251986026763916, "sampling/sampling_logp_difference/max": 1.0146760940551758, "sampling/sampling_logp_difference/mean": 0.018194351345300674, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 210.78125, "completions/mean_terminated_length": 210.78125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2296508550643921, "epoch": 1.7267156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.108559590314233, "kl": 0.08380186557769775, "learning_rate": 4.576641826136884e-07, "loss": 0.0064, "num_tokens": 44519154.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00016188621521, "sampling/importance_sampling_ratio/min": 0.43257567286491394, "sampling/sampling_logp_difference/max": 0.8849180936813354, "sampling/sampling_logp_difference/mean": 0.01322566345334053, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 156.765625, "completions/mean_terminated_length": 156.765625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2926194369792938, "epoch": 1.7279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 3.6724582500652336, "kl": 0.15845778584480286, "learning_rate": 4.5695440475554864e-07, "loss": -0.0517, "num_tokens": 44546243.0, "reward": 0.5, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7805469036102295, "sampling/importance_sampling_ratio/mean": 1.0001507997512817, "sampling/importance_sampling_ratio/min": 0.6211308836936951, "sampling/sampling_logp_difference/max": 0.5769205093383789, "sampling/sampling_logp_difference/mean": 0.014656851068139076, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 158.6875, "completions/mean_terminated_length": 158.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3535138964653015, "epoch": 1.7291666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.13681855629173315, "kl": 0.1603984832763672, "learning_rate": 4.5624471427724036e-07, "loss": 0.0015, "num_tokens": 44570383.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744293928146362, "sampling/importance_sampling_ratio/mean": 0.9997444152832031, "sampling/importance_sampling_ratio/min": 0.6240845322608948, "sampling/sampling_logp_difference/max": 0.4714694023132324, "sampling/sampling_logp_difference/mean": 0.017064634710550308, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 157.359375, "completions/mean_terminated_length": 157.359375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.28453484177589417, "epoch": 1.7303921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.1375969874177505, "kl": 0.14199012517929077, "learning_rate": 4.5553511261939e-07, "loss": 0.0074, "num_tokens": 44599782.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.9123090505599976, "sampling/importance_sampling_ratio/mean": 1.000168800354004, "sampling/importance_sampling_ratio/min": 0.48103198409080505, "sampling/sampling_logp_difference/max": 0.7318215370178223, "sampling/sampling_logp_difference/mean": 0.015347521752119064, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 139.03125, "completions/mean_terminated_length": 139.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.27037662267684937, "epoch": 1.7316176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.7043442480997766, "kl": 0.17010244727134705, "learning_rate": 4.5482560122244407e-07, "loss": -0.0074, "num_tokens": 44623096.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6923563480377197, "sampling/importance_sampling_ratio/mean": 1.0002752542495728, "sampling/importance_sampling_ratio/min": 0.6042792797088623, "sampling/sampling_logp_difference/max": 0.5261218547821045, "sampling/sampling_logp_difference/mean": 0.015295255929231644, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 193.203125, "completions/mean_terminated_length": 193.203125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.271964967250824, "epoch": 1.732843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.3621892146942411, "kl": 0.10405793786048889, "learning_rate": 4.541161815266658e-07, "loss": 0.0561, "num_tokens": 44653477.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7014896869659424, "sampling/importance_sampling_ratio/mean": 1.000382900238037, "sampling/importance_sampling_ratio/min": 0.6019006371498108, "sampling/sampling_logp_difference/max": 0.5315041542053223, "sampling/sampling_logp_difference/mean": 0.014222925528883934, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 162.515625, "completions/mean_terminated_length": 162.515625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2344822883605957, "epoch": 1.7340686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.14854398978953112, "kl": 0.1217186450958252, "learning_rate": 4.534068549721324e-07, "loss": 0.0012, "num_tokens": 44678726.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5074125528335571, "sampling/importance_sampling_ratio/mean": 1.0000494718551636, "sampling/importance_sampling_ratio/min": 0.5466105937957764, "sampling/sampling_logp_difference/max": 0.6040186882019043, "sampling/sampling_logp_difference/mean": 0.014358972199261189, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2636425495147705, "epoch": 1.7352941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.4177361675699305, "kl": 0.11610788106918335, "learning_rate": 4.5269762299873144e-07, "loss": 0.0153, "num_tokens": 44711294.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.597176194190979, "sampling/importance_sampling_ratio/mean": 0.9999326467514038, "sampling/importance_sampling_ratio/min": 0.6056553721427917, "sampling/sampling_logp_difference/max": 0.5014441013336182, "sampling/sampling_logp_difference/mean": 0.014468722976744175, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 202.109375, "completions/mean_terminated_length": 202.109375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.32975006103515625, "epoch": 1.7365196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.4751448800671598, "kl": 0.21373483538627625, "learning_rate": 4.519884870461591e-07, "loss": -0.0795, "num_tokens": 44743829.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5812915563583374, "sampling/importance_sampling_ratio/mean": 0.9993947148323059, "sampling/importance_sampling_ratio/min": 0.48101913928985596, "sampling/sampling_logp_difference/max": 0.7318482398986816, "sampling/sampling_logp_difference/mean": 0.015227858908474445, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 134.84375, "completions/mean_terminated_length": 134.84375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2488342523574829, "epoch": 1.7377450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.2362308456338915, "kl": 0.15685197710990906, "learning_rate": 4.512794485539165e-07, "loss": -0.0226, "num_tokens": 44766139.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7938346862792969, "sampling/importance_sampling_ratio/mean": 0.9995754957199097, "sampling/importance_sampling_ratio/min": 0.6154866218566895, "sampling/sampling_logp_difference/max": 0.5843555927276611, "sampling/sampling_logp_difference/mean": 0.012182684615254402, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3114086389541626, "epoch": 1.7389705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.9221166858957488, "kl": 0.13919496536254883, "learning_rate": 4.505705089613068e-07, "loss": -0.0021, "num_tokens": 44792923.0, "reward": 0.71875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999337196350098, "sampling/importance_sampling_ratio/min": 0.6188860535621643, "sampling/sampling_logp_difference/max": 0.7540798187255859, "sampling/sampling_logp_difference/mean": 0.01507820375263691, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 140.96875, "completions/mean_terminated_length": 140.96875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2667887210845947, "epoch": 1.7401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.06858234105509331, "kl": 0.14108605682849884, "learning_rate": 4.4986166970743233e-07, "loss": 0.0015, "num_tokens": 44816153.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3518649339675903, "sampling/importance_sampling_ratio/mean": 1.0000407695770264, "sampling/importance_sampling_ratio/min": 0.49824461340904236, "sampling/sampling_logp_difference/max": 0.6966640949249268, "sampling/sampling_logp_difference/mean": 0.01382505428045988, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 168.203125, "completions/mean_terminated_length": 168.203125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3003990650177002, "epoch": 1.741421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 2.128264659039382, "kl": 0.15019284188747406, "learning_rate": 4.4915293223119205e-07, "loss": -0.0544, "num_tokens": 44842950.0, "reward": 0.125, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5724730491638184, "sampling/importance_sampling_ratio/mean": 1.0003362894058228, "sampling/importance_sampling_ratio/min": 0.5630784630775452, "sampling/sampling_logp_difference/max": 0.5743362903594971, "sampling/sampling_logp_difference/mean": 0.015519457869231701, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.40841609239578247, "epoch": 1.7426470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 2.3376997319413864, "kl": 0.17027829587459564, "learning_rate": 4.484442979712783e-07, "loss": 0.0297, "num_tokens": 44876934.0, "reward": 0.6875, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7583552598953247, "sampling/importance_sampling_ratio/mean": 1.0000131130218506, "sampling/importance_sampling_ratio/min": 0.4725813865661621, "sampling/sampling_logp_difference/max": 0.7495453357696533, "sampling/sampling_logp_difference/mean": 0.019672289490699768, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 129.578125, "completions/mean_terminated_length": 129.578125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.24084897339344025, "epoch": 1.7438725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 2.3901074466322845, "kl": 0.17182883620262146, "learning_rate": 4.477357683661733e-07, "loss": 0.0176, "num_tokens": 44899499.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.546775221824646, "sampling/importance_sampling_ratio/mean": 0.9997415542602539, "sampling/importance_sampling_ratio/min": 0.6130619645118713, "sampling/sampling_logp_difference/max": 0.4892892837524414, "sampling/sampling_logp_difference/mean": 0.012741761282086372, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 173.28125, "completions/mean_terminated_length": 173.28125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.28197795152664185, "epoch": 1.7450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.05537464355682766, "kl": 0.16134506464004517, "learning_rate": 4.470273448541475e-07, "loss": 0.0015, "num_tokens": 44924973.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6356489658355713, "sampling/importance_sampling_ratio/mean": 0.9994578957557678, "sampling/importance_sampling_ratio/min": 0.4031654894351959, "sampling/sampling_logp_difference/max": 0.9084081649780273, "sampling/sampling_logp_difference/mean": 0.015645436942577362, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.30695268511772156, "epoch": 1.7463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.4887156549415028, "kl": 0.12615860998630524, "learning_rate": 4.4631902887325567e-07, "loss": -0.016, "num_tokens": 44960245.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.000196099281311, "sampling/importance_sampling_ratio/min": 0.5263950228691101, "sampling/sampling_logp_difference/max": 0.6417033672332764, "sampling/sampling_logp_difference/mean": 0.017607446759939194, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 192.515625, "completions/mean_terminated_length": 192.515625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2785497307777405, "epoch": 1.7475490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.3593862911315862, "kl": 0.116864413022995, "learning_rate": 4.4561082186133456e-07, "loss": 0.0255, "num_tokens": 44985142.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5012266635894775, "sampling/importance_sampling_ratio/mean": 1.00004243850708, "sampling/importance_sampling_ratio/min": 0.6378768682479858, "sampling/sampling_logp_difference/max": 0.44960999488830566, "sampling/sampling_logp_difference/mean": 0.014556299895048141, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 202.15625, "completions/mean_terminated_length": 202.15625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2945750951766968, "epoch": 1.7487745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.8847566894250378, "kl": 0.10302512347698212, "learning_rate": 4.4490272525599936e-07, "loss": -0.0613, "num_tokens": 45018288.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6132599115371704, "sampling/importance_sampling_ratio/mean": 1.0001158714294434, "sampling/importance_sampling_ratio/min": 0.5182479619979858, "sampling/sampling_logp_difference/max": 0.6573014259338379, "sampling/sampling_logp_difference/mean": 0.014942665584385395, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 171.671875, "completions/mean_terminated_length": 171.671875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2976667284965515, "epoch": 1.75, "frac_reward_zero_std": 0.5, "grad_norm": 1.687040315389125, "kl": 0.15053917467594147, "learning_rate": 4.4419474049464135e-07, "loss": -0.0284, "num_tokens": 45044187.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5078672170639038, "sampling/importance_sampling_ratio/mean": 1.0005179643630981, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.014646529220044613, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 186.984375, "completions/mean_terminated_length": 186.984375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24708235263824463, "epoch": 1.7512254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.061962294927390936, "kl": 0.13475078344345093, "learning_rate": 4.43486869014425e-07, "loss": 0.0013, "num_tokens": 45077210.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.609244704246521, "sampling/importance_sampling_ratio/mean": 0.9998207688331604, "sampling/importance_sampling_ratio/min": 0.47399142384529114, "sampling/sampling_logp_difference/max": 0.7465660572052002, "sampling/sampling_logp_difference/mean": 0.013482634909451008, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 202.765625, "completions/mean_terminated_length": 202.765625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.22805166244506836, "epoch": 1.7524509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.3470476320260578, "kl": 0.06614042818546295, "learning_rate": 4.427791122522841e-07, "loss": 0.0056, "num_tokens": 45116507.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8078150749206543, "sampling/importance_sampling_ratio/mean": 0.9995851516723633, "sampling/importance_sampling_ratio/min": 0.36503711342811584, "sampling/sampling_logp_difference/max": 1.007756233215332, "sampling/sampling_logp_difference/mean": 0.014406598173081875, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 208.5, "completions/mean_terminated_length": 208.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.28755977749824524, "epoch": 1.7536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 2.0000316963571345, "kl": 0.11514320969581604, "learning_rate": 4.420714716449203e-07, "loss": -0.0192, "num_tokens": 45146779.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998273253440857, "sampling/importance_sampling_ratio/min": 0.4000128507614136, "sampling/sampling_logp_difference/max": 0.9162585735321045, "sampling/sampling_logp_difference/mean": 0.016624003648757935, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 205.0625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.29105067253112793, "epoch": 1.7549019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.7683078310477278, "kl": 0.11758530139923096, "learning_rate": 4.413639486287991e-07, "loss": 0.0201, "num_tokens": 45178447.0, "reward": 0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9833096265792847, "sampling/importance_sampling_ratio/mean": 0.9991083145141602, "sampling/importance_sampling_ratio/min": 0.6146351099014282, "sampling/sampling_logp_difference/max": 0.6847670078277588, "sampling/sampling_logp_difference/mean": 0.014459026977419853, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 199.328125, "completions/mean_terminated_length": 199.328125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.21505844593048096, "epoch": 1.7561274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.041452161555423174, "kl": 0.07256513088941574, "learning_rate": 4.406565446401476e-07, "loss": 0.0007, "num_tokens": 45207076.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6253396272659302, "sampling/importance_sampling_ratio/mean": 0.9993472099304199, "sampling/importance_sampling_ratio/min": 0.5857216119766235, "sampling/sampling_logp_difference/max": 0.5349106788635254, "sampling/sampling_logp_difference/mean": 0.012444807216525078, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 205.359375, "completions/mean_terminated_length": 205.359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2595762312412262, "epoch": 1.7573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.052811280893990606, "kl": 0.08328672498464584, "learning_rate": 4.399492611149509e-07, "loss": 0.0008, "num_tokens": 45237019.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6590856313705444, "sampling/importance_sampling_ratio/mean": 0.9994620680809021, "sampling/importance_sampling_ratio/min": 0.5040537714958191, "sampling/sampling_logp_difference/max": 0.6850724220275879, "sampling/sampling_logp_difference/mean": 0.014014007523655891, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.36765241622924805, "epoch": 1.758578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.5545189590979627, "kl": 0.10093101859092712, "learning_rate": 4.392420994889498e-07, "loss": 0.0313, "num_tokens": 45267895.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.667101502418518, "sampling/importance_sampling_ratio/mean": 1.000354290008545, "sampling/importance_sampling_ratio/min": 0.6023568511009216, "sampling/sampling_logp_difference/max": 0.5110864639282227, "sampling/sampling_logp_difference/mean": 0.01743512973189354, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 164.53125, "completions/mean_terminated_length": 164.53125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18837663531303406, "epoch": 1.7598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.05479383899943296, "kl": 0.0949767529964447, "learning_rate": 4.385350611976376e-07, "loss": 0.0009, "num_tokens": 45293993.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6428558826446533, "sampling/importance_sampling_ratio/mean": 0.999722421169281, "sampling/importance_sampling_ratio/min": 0.5566846132278442, "sampling/sampling_logp_difference/max": 0.5857564210891724, "sampling/sampling_logp_difference/mean": 0.01386752724647522, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 178.765625, "completions/mean_terminated_length": 178.765625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3317575752735138, "epoch": 1.7610294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 2.8074945467788956, "kl": 0.135996013879776, "learning_rate": 4.3782814767625755e-07, "loss": -0.0746, "num_tokens": 45323482.0, "reward": -0.375, "reward_std": 0.6681214570999146, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8816889524459839, "sampling/importance_sampling_ratio/mean": 1.0004677772521973, "sampling/importance_sampling_ratio/min": 0.6718875169754028, "sampling/sampling_logp_difference/max": 0.6321697235107422, "sampling/sampling_logp_difference/mean": 0.016171332448720932, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.32487937808036804, "epoch": 1.7622549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.3665938780035523, "kl": 0.13701051473617554, "learning_rate": 4.371213603597987e-07, "loss": 0.0242, "num_tokens": 45356570.0, "reward": 0.40625, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000591278076172, "sampling/importance_sampling_ratio/min": 0.6603941917419434, "sampling/sampling_logp_difference/max": 0.7689223289489746, "sampling/sampling_logp_difference/mean": 0.014964740723371506, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 224.234375, "completions/mean_terminated_length": 224.234375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3308764696121216, "epoch": 1.7634803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.5102491486744842, "kl": 0.1029498428106308, "learning_rate": 4.3641470068299483e-07, "loss": -0.0091, "num_tokens": 45396089.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997514486312866, "sampling/importance_sampling_ratio/min": 0.27851998805999756, "sampling/sampling_logp_difference/max": 1.434424877166748, "sampling/sampling_logp_difference/mean": 0.017084071412682533, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 240.765625, "completions/mean_terminated_length": 240.765625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.330849289894104, "epoch": 1.7647058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.0874013950583177, "kl": 0.10099180042743683, "learning_rate": 4.3570817008032044e-07, "loss": -0.0211, "num_tokens": 45428874.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6062994003295898, "sampling/importance_sampling_ratio/mean": 1.0001918077468872, "sampling/importance_sampling_ratio/min": 0.25758519768714905, "sampling/sampling_logp_difference/max": 1.3564047813415527, "sampling/sampling_logp_difference/mean": 0.015466933138668537, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 141.34375, "completions/mean_terminated_length": 141.34375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.21416430175304413, "epoch": 1.7659313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.5186219349172438, "kl": 0.11881053447723389, "learning_rate": 4.350017699859877e-07, "loss": 0.0205, "num_tokens": 45450800.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5538417100906372, "sampling/importance_sampling_ratio/mean": 0.9992287158966064, "sampling/importance_sampling_ratio/min": 0.6020976305007935, "sampling/sampling_logp_difference/max": 0.5073356628417969, "sampling/sampling_logp_difference/mean": 0.013720016926527023, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 232.84375, "completions/mean_terminated_length": 232.84375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3054463863372803, "epoch": 1.767156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.3211377376294955, "kl": 0.11160194873809814, "learning_rate": 4.342955018339441e-07, "loss": 0.0844, "num_tokens": 45481910.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9497469663619995, "sampling/importance_sampling_ratio/mean": 1.0000035762786865, "sampling/importance_sampling_ratio/min": 0.5904914140701294, "sampling/sampling_logp_difference/max": 0.6676995754241943, "sampling/sampling_logp_difference/mean": 0.014185999520123005, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 266.09375, "completions/mean_terminated_length": 266.09375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3187551200389862, "epoch": 1.7683823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.5081586476203768, "kl": 0.11256249248981476, "learning_rate": 4.335893670578694e-07, "loss": 0.0353, "num_tokens": 45520380.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996122717857361, "sampling/importance_sampling_ratio/min": 0.46971988677978516, "sampling/sampling_logp_difference/max": 0.7696003913879395, "sampling/sampling_logp_difference/mean": 0.016480494290590286, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 166.671875, "completions/mean_terminated_length": 166.671875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21188709139823914, "epoch": 1.7696078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.11147503156738692, "kl": 0.10149864852428436, "learning_rate": 4.328833670911724e-07, "loss": 0.001, "num_tokens": 45546167.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7319319248199463, "sampling/importance_sampling_ratio/mean": 0.999884307384491, "sampling/importance_sampling_ratio/min": 0.6386851668357849, "sampling/sampling_logp_difference/max": 0.5492374897003174, "sampling/sampling_logp_difference/mean": 0.012810531072318554, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 185.21875, "completions/mean_terminated_length": 185.21875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2641652226448059, "epoch": 1.7708333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.3653630721501335, "kl": 0.10343880951404572, "learning_rate": 4.3217750336698803e-07, "loss": -0.0101, "num_tokens": 45571989.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998292326927185, "sampling/importance_sampling_ratio/min": 0.4542100131511688, "sampling/sampling_logp_difference/max": 0.8377575874328613, "sampling/sampling_logp_difference/mean": 0.014748496934771538, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 165.390625, "completions/mean_terminated_length": 165.390625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.2973984181880951, "epoch": 1.7720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.05266866935891322, "kl": 0.11347492039203644, "learning_rate": 4.314717773181752e-07, "loss": 0.0011, "num_tokens": 45601422.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5784883499145508, "sampling/importance_sampling_ratio/mean": 0.9995951652526855, "sampling/importance_sampling_ratio/min": 0.6271665096282959, "sampling/sampling_logp_difference/max": 0.46654319763183594, "sampling/sampling_logp_difference/mean": 0.015134399756789207, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 180.484375, "completions/mean_terminated_length": 180.484375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.29683828353881836, "epoch": 1.7732843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.341287814484405, "kl": 0.11379870027303696, "learning_rate": 4.3076619037731287e-07, "loss": 0.0465, "num_tokens": 45628669.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5927168130874634, "sampling/importance_sampling_ratio/mean": 0.9999488592147827, "sampling/importance_sampling_ratio/min": 0.6072784662246704, "sampling/sampling_logp_difference/max": 0.4987678527832031, "sampling/sampling_logp_difference/mean": 0.015673775225877762, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 287.046875, "completions/mean_terminated_length": 287.046875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3124946355819702, "epoch": 1.7745098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.2691119800818518, "kl": 0.11259143054485321, "learning_rate": 4.3006074397669836e-07, "loss": 0.0939, "num_tokens": 45667120.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001816749572754, "sampling/importance_sampling_ratio/min": 0.3952656090259552, "sampling/sampling_logp_difference/max": 0.9281973838806152, "sampling/sampling_logp_difference/mean": 0.014476969838142395, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 244.390625, "completions/mean_terminated_length": 244.390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3101966977119446, "epoch": 1.7757352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.03819452015546175, "kl": 0.08543956279754639, "learning_rate": 4.293554395483425e-07, "loss": 0.0008, "num_tokens": 45705913.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6706359386444092, "sampling/importance_sampling_ratio/mean": 1.0001120567321777, "sampling/importance_sampling_ratio/min": 0.4687845706939697, "sampling/sampling_logp_difference/max": 0.7576119899749756, "sampling/sampling_logp_difference/mean": 0.016486328095197678, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 291.453125, "completions/mean_terminated_length": 291.453125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2985606789588928, "epoch": 1.7769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.915618789408616, "kl": 0.08149291574954987, "learning_rate": 4.2865027852396894e-07, "loss": -0.0011, "num_tokens": 45744630.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7755229473114014, "sampling/importance_sampling_ratio/mean": 0.9995817542076111, "sampling/importance_sampling_ratio/min": 0.5869686603546143, "sampling/sampling_logp_difference/max": 0.5740950107574463, "sampling/sampling_logp_difference/mean": 0.014272743836045265, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 165.359375, "completions/mean_terminated_length": 165.359375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.26212406158447266, "epoch": 1.778186274509804, "frac_reward_zero_std": 0.5, "grad_norm": 2.291255588472374, "kl": 0.18543842434883118, "learning_rate": 4.2794526233501004e-07, "loss": 0.0256, "num_tokens": 45769853.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004011392593384, "sampling/importance_sampling_ratio/min": 0.5027245283126831, "sampling/sampling_logp_difference/max": 0.7940542697906494, "sampling/sampling_logp_difference/mean": 0.013519938103854656, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 214.671875, "completions/mean_terminated_length": 214.671875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3533676862716675, "epoch": 1.7794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.08780767237267713, "kl": 0.13456638157367706, "learning_rate": 4.272403924126035e-07, "loss": 0.0013, "num_tokens": 45801352.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5818028450012207, "sampling/importance_sampling_ratio/mean": 1.00015389919281, "sampling/importance_sampling_ratio/min": 0.6185346841812134, "sampling/sampling_logp_difference/max": 0.48040199279785156, "sampling/sampling_logp_difference/mean": 0.016733458265662193, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 215.515625, "completions/mean_terminated_length": 215.515625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31544774770736694, "epoch": 1.780637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.75756718326626, "kl": 0.1051800325512886, "learning_rate": 4.2653567018759103e-07, "loss": 0.0239, "num_tokens": 45836313.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994698166847229, "sampling/importance_sampling_ratio/min": 0.5134934186935425, "sampling/sampling_logp_difference/max": 1.0599148273468018, "sampling/sampling_logp_difference/mean": 0.016565721482038498, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2523324489593506, "epoch": 1.781862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.9300216698323395, "kl": 0.07508465647697449, "learning_rate": 4.258310970905139e-07, "loss": -0.0007, "num_tokens": 45875193.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6011593341827393, "sampling/importance_sampling_ratio/mean": 1.0002411603927612, "sampling/importance_sampling_ratio/min": 0.6262943148612976, "sampling/sampling_logp_difference/max": 0.47072792053222656, "sampling/sampling_logp_difference/mean": 0.012446523644030094, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 239.578125, "completions/mean_terminated_length": 239.578125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3384774923324585, "epoch": 1.7830882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.05992620510387595, "kl": 0.09544166922569275, "learning_rate": 4.251266745516112e-07, "loss": 0.001, "num_tokens": 45913806.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5321756601333618, "sampling/importance_sampling_ratio/mean": 1.0001329183578491, "sampling/importance_sampling_ratio/min": 0.5305793881416321, "sampling/sampling_logp_difference/max": 0.6337857246398926, "sampling/sampling_logp_difference/mean": 0.015669850632548332, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 182.453125, "completions/mean_terminated_length": 182.453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2476593255996704, "epoch": 1.784313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.1933820000353008, "kl": 0.12715579569339752, "learning_rate": 4.2442240400081556e-07, "loss": -0.0053, "num_tokens": 45944635.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.435200810432434, "sampling/importance_sampling_ratio/mean": 1.0001413822174072, "sampling/importance_sampling_ratio/min": 0.44347378611564636, "sampling/sampling_logp_difference/max": 0.8131165504455566, "sampling/sampling_logp_difference/mean": 0.013275878503918648, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 249.453125, "completions/mean_terminated_length": 249.453125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.19521364569664001, "epoch": 1.7855392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.1393297829970839, "kl": 0.07648970186710358, "learning_rate": 4.2371828686775186e-07, "loss": 0.0046, "num_tokens": 45981368.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6007364988327026, "sampling/importance_sampling_ratio/mean": 1.0003199577331543, "sampling/importance_sampling_ratio/min": 0.5038032531738281, "sampling/sampling_logp_difference/max": 0.6855695247650146, "sampling/sampling_logp_difference/mean": 0.01115436665713787, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 200.34375, "completions/mean_terminated_length": 200.34375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2711257338523865, "epoch": 1.7867647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.6282354524255949, "kl": 0.1633697897195816, "learning_rate": 4.2301432458173316e-07, "loss": -0.0459, "num_tokens": 46006734.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.730678677558899, "sampling/importance_sampling_ratio/mean": 1.0005601644515991, "sampling/importance_sampling_ratio/min": 0.5676622986793518, "sampling/sampling_logp_difference/max": 0.5662285089492798, "sampling/sampling_logp_difference/mean": 0.014953596517443657, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3045671582221985, "epoch": 1.7879901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 2.36006326814763, "kl": 0.205718532204628, "learning_rate": 4.223105185717585e-07, "loss": 0.0279, "num_tokens": 46032982.0, "reward": -0.125, "reward_std": 0.6708203554153442, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6023335456848145, "sampling/importance_sampling_ratio/mean": 0.999557375907898, "sampling/importance_sampling_ratio/min": 0.6203153133392334, "sampling/sampling_logp_difference/max": 0.477527379989624, "sampling/sampling_logp_difference/mean": 0.015139946714043617, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 200.921875, "completions/mean_terminated_length": 200.921875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.27441561222076416, "epoch": 1.7892156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.4234743173928497, "kl": 0.10259873420000076, "learning_rate": 4.216068702665093e-07, "loss": 0.0328, "num_tokens": 46063249.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7136951684951782, "sampling/importance_sampling_ratio/mean": 1.0002691745758057, "sampling/importance_sampling_ratio/min": 0.4836650490760803, "sampling/sampling_logp_difference/max": 0.7263627052307129, "sampling/sampling_logp_difference/mean": 0.015271389856934547, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 232.171875, "completions/mean_terminated_length": 232.171875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.23431970179080963, "epoch": 1.7904411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.8919099955040428, "kl": 0.07990675419569016, "learning_rate": 4.2090338109434703e-07, "loss": -0.0099, "num_tokens": 46099852.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5862653255462646, "sampling/importance_sampling_ratio/mean": 0.9999886155128479, "sampling/importance_sampling_ratio/min": 0.5498828291893005, "sampling/sampling_logp_difference/max": 0.5980501174926758, "sampling/sampling_logp_difference/mean": 0.014020893722772598, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.1814536303281784, "epoch": 1.7916666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.04358499048492709, "kl": 0.06934978812932968, "learning_rate": 4.202000524833105e-07, "loss": 0.0007, "num_tokens": 46132708.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4989718198776245, "sampling/importance_sampling_ratio/mean": 0.9997729063034058, "sampling/importance_sampling_ratio/min": 0.5376615524291992, "sampling/sampling_logp_difference/max": 0.6205259561538696, "sampling/sampling_logp_difference/mean": 0.011484737507998943, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 218.359375, "completions/mean_terminated_length": 218.359375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3579706847667694, "epoch": 1.7928921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.2533118260163254, "kl": 0.14592593908309937, "learning_rate": 4.194968858611117e-07, "loss": 0.0016, "num_tokens": 46165419.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.623975396156311, "sampling/importance_sampling_ratio/mean": 0.9996335506439209, "sampling/importance_sampling_ratio/min": 0.5625147223472595, "sampling/sampling_logp_difference/max": 0.5753380060195923, "sampling/sampling_logp_difference/mean": 0.016157599166035652, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 308.015625, "completions/mean_terminated_length": 308.015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.264693021774292, "epoch": 1.7941176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.9742286420006561, "kl": 0.06580743193626404, "learning_rate": 4.187938826551346e-07, "loss": -0.0001, "num_tokens": 46212268.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995682835578918, "sampling/importance_sampling_ratio/min": 0.39027827978134155, "sampling/sampling_logp_difference/max": 0.9408953189849854, "sampling/sampling_logp_difference/mean": 0.01413442101329565, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 162.296875, "completions/mean_terminated_length": 162.296875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.24900653958320618, "epoch": 1.795343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.7551959821952114, "kl": 0.13092195987701416, "learning_rate": 4.180910442924311e-07, "loss": -0.0141, "num_tokens": 46236735.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7444039583206177, "sampling/importance_sampling_ratio/mean": 1.0002326965332031, "sampling/importance_sampling_ratio/min": 0.6056519150733948, "sampling/sampling_logp_difference/max": 0.556412935256958, "sampling/sampling_logp_difference/mean": 0.013616163283586502, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 205.296875, "completions/mean_terminated_length": 205.296875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.21178001165390015, "epoch": 1.7965686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.04014984455842662, "kl": 0.07349078357219696, "learning_rate": 4.173883721997188e-07, "loss": 0.0007, "num_tokens": 46272050.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9960434436798096, "sampling/importance_sampling_ratio/mean": 1.0001407861709595, "sampling/importance_sampling_ratio/min": 0.532551646232605, "sampling/sampling_logp_difference/max": 0.691166877746582, "sampling/sampling_logp_difference/mean": 0.012803494930267334, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 221.796875, "completions/mean_terminated_length": 221.796875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2358531504869461, "epoch": 1.7977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.04191345470477028, "kl": 0.06782007962465286, "learning_rate": 4.1668586780337713e-07, "loss": 0.0007, "num_tokens": 46300501.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5745536088943481, "sampling/importance_sampling_ratio/mean": 1.00028395652771, "sampling/importance_sampling_ratio/min": 0.4825882315635681, "sampling/sampling_logp_difference/max": 0.7285915613174438, "sampling/sampling_logp_difference/mean": 0.013855335302650928, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2522590458393097, "epoch": 1.7990196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.5970123198131199, "kl": 0.13570408523082733, "learning_rate": 4.159835325294457e-07, "loss": -0.0156, "num_tokens": 46327229.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.9506787061691284, "sampling/importance_sampling_ratio/mean": 0.9993472099304199, "sampling/importance_sampling_ratio/min": 0.5643655061721802, "sampling/sampling_logp_difference/max": 0.6681773662567139, "sampling/sampling_logp_difference/mean": 0.013971049338579178, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 160.65625, "completions/mean_terminated_length": 160.65625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2732565402984619, "epoch": 1.8002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.1604348819027174, "kl": 0.1681511104106903, "learning_rate": 4.152813678036208e-07, "loss": 0.0015, "num_tokens": 46356087.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995669722557068, "sampling/importance_sampling_ratio/min": 0.5160248875617981, "sampling/sampling_logp_difference/max": 1.227365493774414, "sampling/sampling_logp_difference/mean": 0.01550702191889286, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 217.234375, "completions/mean_terminated_length": 217.234375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3137251138687134, "epoch": 1.8014705882352942, "frac_reward_zero_std": 0.25, "grad_norm": 1.961620072047609, "kl": 0.13955897092819214, "learning_rate": 4.145793750512522e-07, "loss": -0.0159, "num_tokens": 46386198.0, "reward": -0.3125, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.488340973854065, "sampling/importance_sampling_ratio/mean": 0.9991217255592346, "sampling/importance_sampling_ratio/min": 0.4871235191822052, "sampling/sampling_logp_difference/max": 0.7192375659942627, "sampling/sampling_logp_difference/mean": 0.01649576798081398, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27667832374572754, "epoch": 1.8026960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.6599879293237676, "kl": 0.1454058289527893, "learning_rate": 4.1387755569734054e-07, "loss": -0.002, "num_tokens": 46417110.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000545620918274, "sampling/importance_sampling_ratio/min": 0.4651028513908386, "sampling/sampling_logp_difference/max": 0.9523067474365234, "sampling/sampling_logp_difference/mean": 0.015224859118461609, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 202.8125, "completions/mean_terminated_length": 202.8125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.29587581753730774, "epoch": 1.803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9993185187123691, "kl": 0.1164996549487114, "learning_rate": 4.131759111665348e-07, "loss": 0.0112, "num_tokens": 46451050.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5471752882003784, "sampling/importance_sampling_ratio/mean": 1.0005613565444946, "sampling/importance_sampling_ratio/min": 0.5380123257637024, "sampling/sampling_logp_difference/max": 0.6198737621307373, "sampling/sampling_logp_difference/mean": 0.016710273921489716, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 217.140625, "completions/mean_terminated_length": 217.140625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.31315773725509644, "epoch": 1.8051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.7123886413960567, "kl": 0.1587759405374527, "learning_rate": 4.1247444288312895e-07, "loss": -0.0008, "num_tokens": 46482403.0, "reward": -0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996099472045898, "sampling/importance_sampling_ratio/min": 0.3785988986492157, "sampling/sampling_logp_difference/max": 2.326936721801758, "sampling/sampling_logp_difference/mean": 0.017582248896360397, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 294.796875, "completions/mean_terminated_length": 294.796875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.41033393144607544, "epoch": 1.8063725490196079, "frac_reward_zero_std": 0.25, "grad_norm": 1.4855582911495697, "kl": 0.13153117895126343, "learning_rate": 4.1177315227105926e-07, "loss": 0.0051, "num_tokens": 46522150.0, "reward": 0.28125, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.714239239692688, "sampling/importance_sampling_ratio/mean": 0.9994593858718872, "sampling/importance_sampling_ratio/min": 0.6069623231887817, "sampling/sampling_logp_difference/max": 0.5389693975448608, "sampling/sampling_logp_difference/mean": 0.018105637282133102, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 222.171875, "completions/mean_terminated_length": 222.171875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2713545560836792, "epoch": 1.8075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.09772615448301242, "kl": 0.12450879067182541, "learning_rate": 4.1107204075390096e-07, "loss": 0.0012, "num_tokens": 46549713.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.0002514123916626, "sampling/importance_sampling_ratio/min": 0.5910095572471619, "sampling/sampling_logp_difference/max": 0.5607883930206299, "sampling/sampling_logp_difference/mean": 0.014145957306027412, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 213.859375, "completions/mean_terminated_length": 213.859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24778100848197937, "epoch": 1.8088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.3635704139500446, "kl": 0.07690733671188354, "learning_rate": 4.1037110975486617e-07, "loss": 0.028, "num_tokens": 46580568.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.83034086227417, "sampling/importance_sampling_ratio/mean": 1.0001349449157715, "sampling/importance_sampling_ratio/min": 0.5244014859199524, "sampling/sampling_logp_difference/max": 0.6454976797103882, "sampling/sampling_logp_difference/mean": 0.01386608649045229, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 270.734375, "completions/mean_terminated_length": 270.734375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2906627655029297, "epoch": 1.8100490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.351512616836499, "kl": 0.07863107323646545, "learning_rate": 4.096703606968006e-07, "loss": 0.063, "num_tokens": 46615495.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6088883876800537, "sampling/importance_sampling_ratio/mean": 0.9999545216560364, "sampling/importance_sampling_ratio/min": 0.41614967584609985, "sampling/sampling_logp_difference/max": 0.8767102956771851, "sampling/sampling_logp_difference/mean": 0.013265259563922882, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 289.578125, "completions/mean_terminated_length": 289.578125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3211671710014343, "epoch": 1.8112745098039216, "frac_reward_zero_std": 0.25, "grad_norm": 1.4997630477810504, "kl": 0.10330028831958771, "learning_rate": 4.0896979500218014e-07, "loss": -0.0384, "num_tokens": 46659532.0, "reward": 0.125, "reward_std": 0.6047805547714233, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998058080673218, "sampling/importance_sampling_ratio/min": 0.5684040188789368, "sampling/sampling_logp_difference/max": 0.7329325675964355, "sampling/sampling_logp_difference/mean": 0.015124181285500526, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 249.078125, "completions/mean_terminated_length": 249.078125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23253154754638672, "epoch": 1.8125, "frac_reward_zero_std": 0.75, "grad_norm": 1.179699382372317, "kl": 0.08571634441614151, "learning_rate": 4.082694140931088e-07, "loss": -0.0072, "num_tokens": 46693073.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7971853017807007, "sampling/importance_sampling_ratio/mean": 0.999946117401123, "sampling/importance_sampling_ratio/min": 0.46659648418426514, "sampling/sampling_logp_difference/max": 0.7622904777526855, "sampling/sampling_logp_difference/mean": 0.012605142779648304, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 182.84375, "completions/mean_terminated_length": 182.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.27958163619041443, "epoch": 1.8137254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.05490407431691958, "kl": 0.1917632669210434, "learning_rate": 4.0756921939131563e-07, "loss": 0.0017, "num_tokens": 46721959.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8738871812820435, "sampling/importance_sampling_ratio/mean": 1.0003198385238647, "sampling/importance_sampling_ratio/min": 0.6499074101448059, "sampling/sampling_logp_difference/max": 0.6280150413513184, "sampling/sampling_logp_difference/mean": 0.014630744233727455, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 258.546875, "completions/mean_terminated_length": 258.546875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3771745562553406, "epoch": 1.8149509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.05803421229739693, "kl": 0.16859950125217438, "learning_rate": 4.0686921231815155e-07, "loss": 0.0016, "num_tokens": 46756570.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6955606937408447, "sampling/importance_sampling_ratio/mean": 0.9999762773513794, "sampling/importance_sampling_ratio/min": 0.5583487153053284, "sampling/sampling_logp_difference/max": 0.5827715396881104, "sampling/sampling_logp_difference/mean": 0.017337318509817123, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 198.78125, "completions/mean_terminated_length": 198.78125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2628799378871918, "epoch": 1.8161764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.04604941449462, "kl": 0.15804724395275116, "learning_rate": 4.0616939429458627e-07, "loss": 0.0333, "num_tokens": 46783628.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5277986526489258, "sampling/importance_sampling_ratio/mean": 0.9998676776885986, "sampling/importance_sampling_ratio/min": 0.6298384666442871, "sampling/sampling_logp_difference/max": 0.462291955947876, "sampling/sampling_logp_difference/mean": 0.014284651726484299, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 194.515625, "completions/mean_terminated_length": 194.515625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2097713053226471, "epoch": 1.8174019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.05423994249844911, "kl": 0.08659006655216217, "learning_rate": 4.0546976674120623e-07, "loss": 0.0009, "num_tokens": 46814925.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.62943696975708, "sampling/importance_sampling_ratio/mean": 1.0001730918884277, "sampling/importance_sampling_ratio/min": 0.6562926173210144, "sampling/sampling_logp_difference/max": 0.4882345199584961, "sampling/sampling_logp_difference/mean": 0.011546341702342033, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 252.90625, "completions/mean_terminated_length": 252.90625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.35610705614089966, "epoch": 1.8186274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 1.628361087929497, "kl": 0.1090826690196991, "learning_rate": 4.047703310782111e-07, "loss": -0.0379, "num_tokens": 46854375.0, "reward": 0.28125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003490447998047, "sampling/importance_sampling_ratio/min": 0.4153086543083191, "sampling/sampling_logp_difference/max": 0.8787332773208618, "sampling/sampling_logp_difference/mean": 0.01803845912218094, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 233.453125, "completions/mean_terminated_length": 233.453125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2663761377334595, "epoch": 1.8198529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.9176760582059719, "kl": 0.09884761273860931, "learning_rate": 4.0407108872541105e-07, "loss": -0.009, "num_tokens": 46891204.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6473641395568848, "sampling/importance_sampling_ratio/mean": 1.0004463195800781, "sampling/importance_sampling_ratio/min": 0.6516190767288208, "sampling/sampling_logp_difference/max": 0.4991765022277832, "sampling/sampling_logp_difference/mean": 0.01306439470499754, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 195.03125, "completions/mean_terminated_length": 195.03125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.28954237699508667, "epoch": 1.821078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.8364595052336887, "kl": 0.1455627828836441, "learning_rate": 4.0337204110222347e-07, "loss": 0.0807, "num_tokens": 46923334.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9398034811019897, "sampling/importance_sampling_ratio/mean": 0.9994158744812012, "sampling/importance_sampling_ratio/min": 0.5024063587188721, "sampling/sampling_logp_difference/max": 0.6883460283279419, "sampling/sampling_logp_difference/mean": 0.015754956752061844, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 202.53125, "completions/mean_terminated_length": 202.53125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.23330318927764893, "epoch": 1.8223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.2342846455729133, "kl": 0.12264470756053925, "learning_rate": 4.0267318962767076e-07, "loss": 0.0136, "num_tokens": 46953992.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991086721420288, "sampling/importance_sampling_ratio/min": 0.5747461915016174, "sampling/sampling_logp_difference/max": 0.7172539234161377, "sampling/sampling_logp_difference/mean": 0.013632211834192276, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 178.40625, "completions/mean_terminated_length": 178.40625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2289418876171112, "epoch": 1.8235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.841502353638724, "kl": 0.18628448247909546, "learning_rate": 4.0197453572037747e-07, "loss": 0.0018, "num_tokens": 46983538.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001065731048584, "sampling/importance_sampling_ratio/min": 0.6073968410491943, "sampling/sampling_logp_difference/max": 0.754854679107666, "sampling/sampling_logp_difference/mean": 0.013173737563192844, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 205.859375, "completions/mean_terminated_length": 205.859375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.30589017271995544, "epoch": 1.8247549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 2.243124549402031, "kl": 0.12316029518842697, "learning_rate": 4.0127608079856644e-07, "loss": -0.1134, "num_tokens": 47011913.0, "reward": -0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.7125898599624634, "sampling/importance_sampling_ratio/mean": 0.9999831914901733, "sampling/importance_sampling_ratio/min": 0.4982426166534424, "sampling/sampling_logp_difference/max": 0.6966681480407715, "sampling/sampling_logp_difference/mean": 0.014620396308600903, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.359375, "completions/mean_terminated_length": 169.359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.25947123765945435, "epoch": 1.8259803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.4159056667900753, "kl": 0.12732058763504028, "learning_rate": 4.005778262800571e-07, "loss": -0.0308, "num_tokens": 47042592.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004578828811646, "sampling/importance_sampling_ratio/min": 0.3742866814136505, "sampling/sampling_logp_difference/max": 0.9827332496643066, "sampling/sampling_logp_difference/mean": 0.015355970710515976, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2130059003829956, "epoch": 1.8272058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.8738946641892246, "kl": 0.07923636585474014, "learning_rate": 3.9987977358226175e-07, "loss": 0.0033, "num_tokens": 47074416.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6526607275009155, "sampling/importance_sampling_ratio/mean": 1.0003128051757812, "sampling/importance_sampling_ratio/min": 0.38447070121765137, "sampling/sampling_logp_difference/max": 0.9558877944946289, "sampling/sampling_logp_difference/mean": 0.01331046037375927, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 289.15625, "completions/mean_terminated_length": 289.15625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.36798161268234253, "epoch": 1.8284313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.9583959489737331, "kl": 0.09859618544578552, "learning_rate": 3.991819241221835e-07, "loss": 0.0271, "num_tokens": 47125066.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995404481887817, "sampling/importance_sampling_ratio/min": 0.48147302865982056, "sampling/sampling_logp_difference/max": 0.8621160984039307, "sampling/sampling_logp_difference/mean": 0.018618982285261154, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 228.578125, "completions/mean_terminated_length": 228.578125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3434247672557831, "epoch": 1.829656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.261896640877614, "kl": 0.1256704330444336, "learning_rate": 3.98484279316412e-07, "loss": 0.0092, "num_tokens": 47161151.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7861305475234985, "sampling/importance_sampling_ratio/mean": 0.9994518756866455, "sampling/importance_sampling_ratio/min": 0.42127373814582825, "sampling/sampling_logp_difference/max": 0.8644723892211914, "sampling/sampling_logp_difference/mean": 0.01720285415649414, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 198.203125, "completions/mean_terminated_length": 198.203125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.19486339390277863, "epoch": 1.8308823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0524757162358044, "kl": 0.07670879364013672, "learning_rate": 3.977868405811223e-07, "loss": 0.0007, "num_tokens": 47189228.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9442415237426758, "sampling/importance_sampling_ratio/mean": 1.0005048513412476, "sampling/importance_sampling_ratio/min": 0.4719997048377991, "sampling/sampling_logp_difference/max": 0.7507768869400024, "sampling/sampling_logp_difference/mean": 0.012243038043379784, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2507721185684204, "epoch": 1.8321078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.04301824966306972, "kl": 0.07585926353931427, "learning_rate": 3.970896093320708e-07, "loss": 0.0008, "num_tokens": 47220700.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6032006740570068, "sampling/importance_sampling_ratio/mean": 1.0000191926956177, "sampling/importance_sampling_ratio/min": 0.33986422419548035, "sampling/sampling_logp_difference/max": 1.0792090892791748, "sampling/sampling_logp_difference/mean": 0.014406761154532433, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.33627083897590637, "epoch": 1.8333333333333335, "frac_reward_zero_std": 0.25, "grad_norm": 2.0551102354014685, "kl": 0.12954556941986084, "learning_rate": 3.9639258698459287e-07, "loss": 0.0234, "num_tokens": 47251628.0, "reward": 0.65625, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.601887822151184, "sampling/importance_sampling_ratio/mean": 1.0008529424667358, "sampling/importance_sampling_ratio/min": 0.6152583360671997, "sampling/sampling_logp_difference/max": 0.48571306467056274, "sampling/sampling_logp_difference/mean": 0.016873590648174286, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.22614750266075134, "epoch": 1.8345588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.2858808512563478, "kl": 0.0691496804356575, "learning_rate": 3.9569577495359964e-07, "loss": -0.0193, "num_tokens": 47287236.0, "reward": 0.625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.8390065431594849, "sampling/importance_sampling_ratio/mean": 0.9997743368148804, "sampling/importance_sampling_ratio/min": 0.4419277310371399, "sampling/sampling_logp_difference/max": 0.8166089057922363, "sampling/sampling_logp_difference/mean": 0.011157220229506493, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 205.5625, "completions/mean_terminated_length": 205.5625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3292863070964813, "epoch": 1.8357843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 1.4682332015823245, "kl": 0.1530056744813919, "learning_rate": 3.949991746535753e-07, "loss": -0.0221, "num_tokens": 47317560.0, "reward": 0.34375, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5506867170333862, "sampling/importance_sampling_ratio/mean": 0.999796450138092, "sampling/importance_sampling_ratio/min": 0.5594758987426758, "sampling/sampling_logp_difference/max": 0.5807547569274902, "sampling/sampling_logp_difference/mean": 0.01646328717470169, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.19123998284339905, "epoch": 1.8370098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.05356769945636859, "kl": 0.08881576359272003, "learning_rate": 3.943027874985746e-07, "loss": 0.0008, "num_tokens": 47349600.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9949634075164795, "sampling/importance_sampling_ratio/mean": 1.0006195306777954, "sampling/importance_sampling_ratio/min": 0.5720028281211853, "sampling/sampling_logp_difference/max": 0.6906256675720215, "sampling/sampling_logp_difference/mean": 0.012663639150559902, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 198.140625, "completions/mean_terminated_length": 198.140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.22961711883544922, "epoch": 1.8382352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.09255699113039914, "kl": 0.08062359690666199, "learning_rate": 3.9360661490221904e-07, "loss": 0.0008, "num_tokens": 47388361.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9732640981674194, "sampling/importance_sampling_ratio/mean": 0.9990033507347107, "sampling/importance_sampling_ratio/min": 0.46037212014198303, "sampling/sampling_logp_difference/max": 0.7757201194763184, "sampling/sampling_logp_difference/mean": 0.014116080477833748, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 213.375, "completions/mean_terminated_length": 213.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2590044438838959, "epoch": 1.8394607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.4592039551549316, "kl": 0.1066092699766159, "learning_rate": 3.929106582776948e-07, "loss": 0.0136, "num_tokens": 47419361.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6243494749069214, "sampling/importance_sampling_ratio/mean": 0.9995856881141663, "sampling/importance_sampling_ratio/min": 0.48131364583969116, "sampling/sampling_logp_difference/max": 0.7312362194061279, "sampling/sampling_logp_difference/mean": 0.014168147929012775, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3639901876449585, "epoch": 1.840686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.304178011378884, "kl": 0.1234322190284729, "learning_rate": 3.9221491903775013e-07, "loss": 0.0157, "num_tokens": 47452429.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.475376844406128, "sampling/importance_sampling_ratio/mean": 1.0000149011611938, "sampling/importance_sampling_ratio/min": 0.613954484462738, "sampling/sampling_logp_difference/max": 0.48783445358276367, "sampling/sampling_logp_difference/mean": 0.01782539114356041, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 187.171875, "completions/mean_terminated_length": 187.171875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2886807918548584, "epoch": 1.8419117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.9699376969092584, "kl": 0.10959329456090927, "learning_rate": 3.9151939859469166e-07, "loss": -0.0429, "num_tokens": 47483240.0, "reward": 0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7225044965744019, "sampling/importance_sampling_ratio/mean": 0.9997414350509644, "sampling/importance_sampling_ratio/min": 0.4769878685474396, "sampling/sampling_logp_difference/max": 0.7402641773223877, "sampling/sampling_logp_difference/mean": 0.01478142011910677, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2390981912612915, "epoch": 1.843137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.5250096918124199, "kl": 0.108737051486969, "learning_rate": 3.908240983603813e-07, "loss": 0.0073, "num_tokens": 47512808.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.552701711654663, "sampling/importance_sampling_ratio/mean": 0.999780535697937, "sampling/importance_sampling_ratio/min": 0.6056217551231384, "sampling/sampling_logp_difference/max": 0.5014996528625488, "sampling/sampling_logp_difference/mean": 0.012281188741326332, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 193.375, "completions/mean_terminated_length": 193.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.21787826716899872, "epoch": 1.844362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.3875391936463561, "kl": 0.09802134335041046, "learning_rate": 3.9012901974623476e-07, "loss": 0.0085, "num_tokens": 47540016.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8179765939712524, "sampling/importance_sampling_ratio/mean": 0.9997023344039917, "sampling/importance_sampling_ratio/min": 0.5782334208488464, "sampling/sampling_logp_difference/max": 0.5977240800857544, "sampling/sampling_logp_difference/mean": 0.013284020125865936, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 245.71875, "completions/mean_terminated_length": 245.71875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28870612382888794, "epoch": 1.8455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.22860420168087, "kl": 0.10918144136667252, "learning_rate": 3.894341641632176e-07, "loss": -0.0241, "num_tokens": 47579214.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6286340951919556, "sampling/importance_sampling_ratio/mean": 0.9998392462730408, "sampling/importance_sampling_ratio/min": 0.6145747303962708, "sampling/sampling_logp_difference/max": 0.48774170875549316, "sampling/sampling_logp_difference/mean": 0.014949234202504158, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 227.46875, "completions/mean_terminated_length": 227.46875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2842986583709717, "epoch": 1.846813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.0996649217388619, "kl": 0.11265279352664948, "learning_rate": 3.8873953302184283e-07, "loss": -0.004, "num_tokens": 47613676.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6251676082611084, "sampling/importance_sampling_ratio/mean": 1.0002528429031372, "sampling/importance_sampling_ratio/min": 0.6164861917495728, "sampling/sampling_logp_difference/max": 0.4856109619140625, "sampling/sampling_logp_difference/mean": 0.014792348258197308, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 226.34375, "completions/mean_terminated_length": 226.34375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2616327404975891, "epoch": 1.8480392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.2053015883865053, "kl": 0.09128358960151672, "learning_rate": 3.880451277321673e-07, "loss": -0.0056, "num_tokens": 47645954.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6281298398971558, "sampling/importance_sampling_ratio/mean": 0.9998895525932312, "sampling/importance_sampling_ratio/min": 0.6181973814964294, "sampling/sampling_logp_difference/max": 0.48743200302124023, "sampling/sampling_logp_difference/mean": 0.014848420396447182, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 138.84375, "completions/mean_terminated_length": 138.84375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.16786305606365204, "epoch": 1.8492647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.06030377569260939, "kl": 0.08539316058158875, "learning_rate": 3.873509497037899e-07, "loss": 0.0009, "num_tokens": 47674200.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.55501389503479, "sampling/importance_sampling_ratio/mean": 0.9998781085014343, "sampling/importance_sampling_ratio/min": 0.6359153389930725, "sampling/sampling_logp_difference/max": 0.45268988609313965, "sampling/sampling_logp_difference/mean": 0.011962493881583214, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2259249985218048, "epoch": 1.8504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.189394489035492, "kl": 0.08550411462783813, "learning_rate": 3.8665700034584834e-07, "loss": -0.008, "num_tokens": 47706040.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001052618026733, "sampling/importance_sampling_ratio/min": 0.6622968316078186, "sampling/sampling_logp_difference/max": 0.8150925636291504, "sampling/sampling_logp_difference/mean": 0.013018874451518059, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21084097027778625, "epoch": 1.8517156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.1387918498637539, "kl": 0.10547801852226257, "learning_rate": 3.8596328106701533e-07, "loss": 0.0011, "num_tokens": 47729732.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6024487018585205, "sampling/importance_sampling_ratio/mean": 1.0001769065856934, "sampling/importance_sampling_ratio/min": 0.6019724607467651, "sampling/sampling_logp_difference/max": 0.5075435638427734, "sampling/sampling_logp_difference/mean": 0.013056870549917221, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 183.203125, "completions/mean_terminated_length": 183.203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3111172914505005, "epoch": 1.8529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.7726465743314133, "kl": 0.12318593263626099, "learning_rate": 3.8526979327549736e-07, "loss": 0.0036, "num_tokens": 47764209.0, "reward": 0.59375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5539941787719727, "sampling/importance_sampling_ratio/mean": 1.0003302097320557, "sampling/importance_sampling_ratio/min": 0.4000977575778961, "sampling/sampling_logp_difference/max": 0.9160463809967041, "sampling/sampling_logp_difference/mean": 0.01625010371208191, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 164.34375, "completions/mean_terminated_length": 164.34375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2743145823478699, "epoch": 1.8541666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.051260549816440265, "kl": 0.1191352903842926, "learning_rate": 3.845765383790306e-07, "loss": 0.0012, "num_tokens": 47790615.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5348610877990723, "sampling/importance_sampling_ratio/mean": 1.0002870559692383, "sampling/importance_sampling_ratio/min": 0.6147403120994568, "sampling/sampling_logp_difference/max": 0.4865553379058838, "sampling/sampling_logp_difference/mean": 0.01568988896906376, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 164.21875, "completions/mean_terminated_length": 164.21875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3106948137283325, "epoch": 1.8553921568627452, "frac_reward_zero_std": 0.5, "grad_norm": 1.8744626546891598, "kl": 0.20338189601898193, "learning_rate": 3.8388351778487875e-07, "loss": 0.0002, "num_tokens": 47820357.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9546488523483276, "sampling/importance_sampling_ratio/mean": 1.000866174697876, "sampling/importance_sampling_ratio/min": 0.17713141441345215, "sampling/sampling_logp_difference/max": 1.730863332748413, "sampling/sampling_logp_difference/mean": 0.015201722271740437, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.26740169525146484, "epoch": 1.8566176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.556234841472086, "kl": 0.13963264226913452, "learning_rate": 3.831907328998295e-07, "loss": 0.0026, "num_tokens": 47850469.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994199275970459, "sampling/importance_sampling_ratio/min": 0.4526996910572052, "sampling/sampling_logp_difference/max": 0.9892764091491699, "sampling/sampling_logp_difference/mean": 0.015211023390293121, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 150.96875, "completions/mean_terminated_length": 150.96875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.21206092834472656, "epoch": 1.857843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.5210169519150867, "kl": 0.08909106254577637, "learning_rate": 3.824981851301924e-07, "loss": -0.0137, "num_tokens": 47877555.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.3761188983917236, "sampling/importance_sampling_ratio/mean": 1.0001142024993896, "sampling/importance_sampling_ratio/min": 0.500291645526886, "sampling/sampling_logp_difference/max": 0.6925640106201172, "sampling/sampling_logp_difference/mean": 0.012899190187454224, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 162.59375, "completions/mean_terminated_length": 162.59375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2940405309200287, "epoch": 1.8590686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 1.9957742558492082, "kl": 0.17864391207695007, "learning_rate": 3.818058758817955e-07, "loss": 0.0082, "num_tokens": 47908361.0, "reward": 0.28125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6625418663024902, "sampling/importance_sampling_ratio/mean": 1.0001530647277832, "sampling/importance_sampling_ratio/min": 0.5949472188949585, "sampling/sampling_logp_difference/max": 0.5192825794219971, "sampling/sampling_logp_difference/mean": 0.014901855029165745, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 160.71875, "completions/mean_terminated_length": 160.71875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2383217215538025, "epoch": 1.8602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.053719037874208904, "kl": 0.09652819484472275, "learning_rate": 3.81113806559983e-07, "loss": 0.001, "num_tokens": 47934455.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5130321979522705, "sampling/importance_sampling_ratio/mean": 1.00014328956604, "sampling/importance_sampling_ratio/min": 0.5200313329696655, "sampling/sampling_logp_difference/max": 0.6538662910461426, "sampling/sampling_logp_difference/mean": 0.01406307052820921, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2827237546443939, "epoch": 1.8615196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.3917265181523395, "kl": 0.19590789079666138, "learning_rate": 3.804219785696113e-07, "loss": -0.0125, "num_tokens": 47958239.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.628027319908142, "sampling/importance_sampling_ratio/mean": 1.0000962018966675, "sampling/importance_sampling_ratio/min": 0.6129540801048279, "sampling/sampling_logp_difference/max": 0.48946523666381836, "sampling/sampling_logp_difference/mean": 0.015526263043284416, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 174.9375, "completions/mean_terminated_length": 174.9375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.19941727817058563, "epoch": 1.8627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.2961982281604962, "kl": 0.09546373784542084, "learning_rate": 3.797303933150475e-07, "loss": 0.0076, "num_tokens": 47983739.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5509536266326904, "sampling/importance_sampling_ratio/mean": 1.0001888275146484, "sampling/importance_sampling_ratio/min": 0.5733482837677002, "sampling/sampling_logp_difference/max": 0.5562620162963867, "sampling/sampling_logp_difference/mean": 0.012389476411044598, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 227.15625, "completions/mean_terminated_length": 227.15625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23176491260528564, "epoch": 1.8639705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.9506818974268831, "kl": 0.08148758858442307, "learning_rate": 3.790390522001662e-07, "loss": -0.0287, "num_tokens": 48020021.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.8623803853988647, "sampling/importance_sampling_ratio/mean": 1.0003161430358887, "sampling/importance_sampling_ratio/min": 0.4322664439678192, "sampling/sampling_logp_difference/max": 0.8387131690979004, "sampling/sampling_logp_difference/mean": 0.013027187436819077, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28213566541671753, "epoch": 1.8651960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.6442198217386463, "kl": 0.14837434887886047, "learning_rate": 3.7834795662834566e-07, "loss": 0.0212, "num_tokens": 48050085.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6636914014816284, "sampling/importance_sampling_ratio/mean": 1.000145673751831, "sampling/importance_sampling_ratio/min": 0.6549220085144043, "sampling/sampling_logp_difference/max": 0.5090389251708984, "sampling/sampling_logp_difference/mean": 0.013476305641233921, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 243.46875, "completions/mean_terminated_length": 243.46875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3794707655906677, "epoch": 1.866421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.4424293410595619, "kl": 0.1346874237060547, "learning_rate": 3.776571080024663e-07, "loss": 0.0186, "num_tokens": 48089443.0, "reward": 0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4868202209472656, "sampling/importance_sampling_ratio/mean": 1.0007059574127197, "sampling/importance_sampling_ratio/min": 0.47760945558547974, "sampling/sampling_logp_difference/max": 0.738961935043335, "sampling/sampling_logp_difference/mean": 0.017995767295360565, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 181.921875, "completions/mean_terminated_length": 181.921875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3146703243255615, "epoch": 1.8676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.175896482969458, "kl": 0.10516366362571716, "learning_rate": 3.76966507724907e-07, "loss": 0.0082, "num_tokens": 48125550.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009829998016357, "sampling/importance_sampling_ratio/min": 0.12568892538547516, "sampling/sampling_logp_difference/max": 2.2150397300720215, "sampling/sampling_logp_difference/mean": 0.017225069925189018, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 234.796875, "completions/mean_terminated_length": 234.796875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.34670189023017883, "epoch": 1.8688725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.5481841591682168, "kl": 0.08109765499830246, "learning_rate": 3.762761571975429e-07, "loss": 0.0103, "num_tokens": 48162801.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6302809715270996, "sampling/importance_sampling_ratio/mean": 0.9997133016586304, "sampling/importance_sampling_ratio/min": 0.5842203497886658, "sampling/sampling_logp_difference/max": 0.5374770164489746, "sampling/sampling_logp_difference/mean": 0.017425820231437683, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 205.21875, "completions/mean_terminated_length": 205.21875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3151162266731262, "epoch": 1.8700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.3857105972380215, "kl": 0.11170510202646255, "learning_rate": 3.755860578217413e-07, "loss": -0.0017, "num_tokens": 48195327.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6599576473236084, "sampling/importance_sampling_ratio/mean": 0.99996018409729, "sampling/importance_sampling_ratio/min": 0.4196990132331848, "sampling/sampling_logp_difference/max": 0.8682174682617188, "sampling/sampling_logp_difference/mean": 0.01472011860460043, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 181.171875, "completions/mean_terminated_length": 181.171875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2721059322357178, "epoch": 1.8713235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.3365972261907422, "kl": 0.12668752670288086, "learning_rate": 3.7489621099836043e-07, "loss": -0.0189, "num_tokens": 48223498.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6299506425857544, "sampling/importance_sampling_ratio/mean": 0.9992806911468506, "sampling/importance_sampling_ratio/min": 0.37794965505599976, "sampling/sampling_logp_difference/max": 0.972994327545166, "sampling/sampling_logp_difference/mean": 0.014614992775022984, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 184.21875, "completions/mean_terminated_length": 184.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27154216170310974, "epoch": 1.8725490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.1836721743103784, "kl": 0.120089091360569, "learning_rate": 3.742066181277457e-07, "loss": 0.0066, "num_tokens": 48256392.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.0003831386566162, "sampling/importance_sampling_ratio/min": 0.495414674282074, "sampling/sampling_logp_difference/max": 0.7023601531982422, "sampling/sampling_logp_difference/mean": 0.014493845403194427, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 225.78125, "completions/mean_terminated_length": 225.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2942451536655426, "epoch": 1.8737745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.3398500817617414, "kl": 0.09755627810955048, "learning_rate": 3.735172806097271e-07, "loss": -0.0237, "num_tokens": 48291258.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.603361964225769, "sampling/importance_sampling_ratio/mean": 1.0005296468734741, "sampling/importance_sampling_ratio/min": 0.5283520817756653, "sampling/sampling_logp_difference/max": 0.6379923820495605, "sampling/sampling_logp_difference/mean": 0.015455886721611023, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 189.703125, "completions/mean_terminated_length": 189.703125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21763397753238678, "epoch": 1.875, "frac_reward_zero_std": 0.75, "grad_norm": 1.2386752291814265, "kl": 0.11837249249219894, "learning_rate": 3.7282819984361577e-07, "loss": 0.0011, "num_tokens": 48321255.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6312732696533203, "sampling/importance_sampling_ratio/mean": 0.9998139142990112, "sampling/importance_sampling_ratio/min": 0.4972875118255615, "sampling/sampling_logp_difference/max": 0.6985869407653809, "sampling/sampling_logp_difference/mean": 0.012105113826692104, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 195.8125, "completions/mean_terminated_length": 195.8125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.26901644468307495, "epoch": 1.8762254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.2460732346461223, "kl": 0.10338087379932404, "learning_rate": 3.721393772282022e-07, "loss": 0.0037, "num_tokens": 48349707.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.463740348815918, "sampling/importance_sampling_ratio/mean": 0.9998382329940796, "sampling/importance_sampling_ratio/min": 0.4661833643913269, "sampling/sampling_logp_difference/max": 0.7631762027740479, "sampling/sampling_logp_difference/mean": 0.013834136538207531, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 188.515625, "completions/mean_terminated_length": 188.515625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2946900725364685, "epoch": 1.8774509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 1.913459932908527, "kl": 0.15940743684768677, "learning_rate": 3.7145081416175264e-07, "loss": 0.0169, "num_tokens": 48379468.0, "reward": 0.625, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7782058715820312, "sampling/importance_sampling_ratio/mean": 0.999894380569458, "sampling/importance_sampling_ratio/min": 0.6610523462295532, "sampling/sampling_logp_difference/max": 0.5756049156188965, "sampling/sampling_logp_difference/mean": 0.014666395261883736, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 199.671875, "completions/mean_terminated_length": 199.671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.31731051206588745, "epoch": 1.8786764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.9024912622749615, "kl": 0.14467984437942505, "learning_rate": 3.7076251204200667e-07, "loss": -0.1033, "num_tokens": 48410983.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000901222229004, "sampling/importance_sampling_ratio/min": 0.6036661267280579, "sampling/sampling_logp_difference/max": 1.132408618927002, "sampling/sampling_logp_difference/mean": 0.01580166071653366, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 170.671875, "completions/mean_terminated_length": 170.671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2762795686721802, "epoch": 1.8799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.6546919602424224, "kl": 0.11146007478237152, "learning_rate": 3.700744722661736e-07, "loss": 0.0011, "num_tokens": 48435954.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4841010570526123, "sampling/importance_sampling_ratio/mean": 0.9997158646583557, "sampling/importance_sampling_ratio/min": 0.6257337927818298, "sampling/sampling_logp_difference/max": 0.4688303470611572, "sampling/sampling_logp_difference/mean": 0.014529142528772354, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 146.40625, "completions/mean_terminated_length": 146.40625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.26064950227737427, "epoch": 1.8811274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.08540739013399869, "kl": 0.11784351617097855, "learning_rate": 3.693866962309308e-07, "loss": 0.0012, "num_tokens": 48465788.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6977688074111938, "sampling/importance_sampling_ratio/mean": 1.0002050399780273, "sampling/importance_sampling_ratio/min": 0.6122998595237732, "sampling/sampling_logp_difference/max": 0.5293148756027222, "sampling/sampling_logp_difference/mean": 0.014467135071754456, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 187.703125, "completions/mean_terminated_length": 187.703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.34977987408638, "epoch": 1.8823529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.0112208247999641, "kl": 0.1438819169998169, "learning_rate": 3.686991853324202e-07, "loss": 0.0137, "num_tokens": 48497017.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 0.99979168176651, "sampling/importance_sampling_ratio/min": 0.6265778541564941, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.017088035121560097, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 191.1875, "completions/mean_terminated_length": 191.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2127748429775238, "epoch": 1.883578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.05688519141386496, "kl": 0.07666490226984024, "learning_rate": 3.680119409662451e-07, "loss": 0.0007, "num_tokens": 48526053.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8571991920471191, "sampling/importance_sampling_ratio/mean": 0.9998555183410645, "sampling/importance_sampling_ratio/min": 0.4798246920108795, "sampling/sampling_logp_difference/max": 0.7343344688415527, "sampling/sampling_logp_difference/mean": 0.012846048921346664, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 202.640625, "completions/mean_terminated_length": 202.640625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3353343605995178, "epoch": 1.8848039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.813218533983896, "kl": 0.1376620978116989, "learning_rate": 3.673249645274682e-07, "loss": -0.0391, "num_tokens": 48557358.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.8516814708709717, "sampling/importance_sampling_ratio/mean": 1.0001487731933594, "sampling/importance_sampling_ratio/min": 0.47982609272003174, "sampling/sampling_logp_difference/max": 0.7343316078186035, "sampling/sampling_logp_difference/mean": 0.016058053821325302, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 166.265625, "completions/mean_terminated_length": 166.265625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27629363536834717, "epoch": 1.8860294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.04776955300620745, "kl": 0.09619492292404175, "learning_rate": 3.6663825741060805e-07, "loss": 0.001, "num_tokens": 48588079.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997478723526001, "sampling/importance_sampling_ratio/min": 0.3985402286052704, "sampling/sampling_logp_difference/max": 0.9277105331420898, "sampling/sampling_logp_difference/mean": 0.017165351659059525, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 174.671875, "completions/mean_terminated_length": 174.671875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.29174095392227173, "epoch": 1.8872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.5602647270473926, "kl": 0.11156705021858215, "learning_rate": 3.6595182100963686e-07, "loss": 0.0035, "num_tokens": 48614938.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6307547092437744, "sampling/importance_sampling_ratio/mean": 0.9993473887443542, "sampling/importance_sampling_ratio/min": 0.5263680815696716, "sampling/sampling_logp_difference/max": 0.6417546272277832, "sampling/sampling_logp_difference/mean": 0.014674471691250801, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 203.859375, "completions/mean_terminated_length": 203.859375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.28613030910491943, "epoch": 1.8884803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.5430968543347237, "kl": 0.16828909516334534, "learning_rate": 3.652656567179765e-07, "loss": 0.0035, "num_tokens": 48643457.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.9935235977172852, "sampling/importance_sampling_ratio/mean": 0.999808669090271, "sampling/importance_sampling_ratio/min": 0.6217340230941772, "sampling/sampling_logp_difference/max": 0.689903736114502, "sampling/sampling_logp_difference/mean": 0.01587986946105957, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 182.09375, "completions/mean_terminated_length": 182.09375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.30671757459640503, "epoch": 1.8897058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.6157744644914547, "kl": 0.15455128252506256, "learning_rate": 3.645797659284975e-07, "loss": 0.0167, "num_tokens": 48669799.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6490719318389893, "sampling/importance_sampling_ratio/mean": 1.0000280141830444, "sampling/importance_sampling_ratio/min": 0.6067567467689514, "sampling/sampling_logp_difference/max": 0.5002126693725586, "sampling/sampling_logp_difference/mean": 0.01575349271297455, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 203.21875, "completions/mean_terminated_length": 203.21875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3348330855369568, "epoch": 1.8909313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 1.5496038380993364, "kl": 0.1350516676902771, "learning_rate": 3.638941500335144e-07, "loss": 0.0338, "num_tokens": 48700517.0, "reward": -0.125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.96829354763031, "sampling/importance_sampling_ratio/mean": 0.9999657273292542, "sampling/importance_sampling_ratio/min": 0.605185866355896, "sampling/sampling_logp_difference/max": 0.6771669387817383, "sampling/sampling_logp_difference/mean": 0.01601865142583847, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 237.171875, "completions/mean_terminated_length": 237.171875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.35526013374328613, "epoch": 1.892156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.340869176986576, "kl": 0.1703725904226303, "learning_rate": 3.6320881042478433e-07, "loss": -0.0082, "num_tokens": 48731824.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7339519262313843, "sampling/importance_sampling_ratio/mean": 0.999858021736145, "sampling/importance_sampling_ratio/min": 0.35683271288871765, "sampling/sampling_logp_difference/max": 1.0304882526397705, "sampling/sampling_logp_difference/mean": 0.017715346068143845, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 150.1875, "completions/mean_terminated_length": 150.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.219431072473526, "epoch": 1.8933823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 1.6490211239168067, "kl": 0.1438380926847458, "learning_rate": 3.6252374849350303e-07, "loss": -0.0216, "num_tokens": 48759916.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.85312819480896, "sampling/importance_sampling_ratio/mean": 0.9996017217636108, "sampling/importance_sampling_ratio/min": 0.629998505115509, "sampling/sampling_logp_difference/max": 0.616875171661377, "sampling/sampling_logp_difference/mean": 0.013308055698871613, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 249.546875, "completions/mean_terminated_length": 249.546875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22123563289642334, "epoch": 1.8946078431372548, "frac_reward_zero_std": 0.5, "grad_norm": 1.516266467309043, "kl": 0.08375102281570435, "learning_rate": 3.618389656303029e-07, "loss": -0.0845, "num_tokens": 48794687.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8146148920059204, "sampling/importance_sampling_ratio/mean": 0.999570369720459, "sampling/importance_sampling_ratio/min": 0.2815396189689636, "sampling/sampling_logp_difference/max": 1.267482042312622, "sampling/sampling_logp_difference/mean": 0.012413117103278637, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 202.84375, "completions/mean_terminated_length": 202.84375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.278066486120224, "epoch": 1.8958333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.857750462868285, "kl": 0.09571290016174316, "learning_rate": 3.6115446322525e-07, "loss": -0.083, "num_tokens": 48829205.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986664056777954, "sampling/importance_sampling_ratio/min": 0.20802132785320282, "sampling/sampling_logp_difference/max": 1.5701146125793457, "sampling/sampling_logp_difference/mean": 0.01835343800485134, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 216.421875, "completions/mean_terminated_length": 216.421875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3158535957336426, "epoch": 1.8970588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.3937074063685995, "kl": 0.11325500160455704, "learning_rate": 3.6047024266784035e-07, "loss": -0.0164, "num_tokens": 48872864.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6009771823883057, "sampling/importance_sampling_ratio/mean": 1.0001379251480103, "sampling/importance_sampling_ratio/min": 0.44403478503227234, "sampling/sampling_logp_difference/max": 0.8118524551391602, "sampling/sampling_logp_difference/mean": 0.01701243221759796, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 246.6875, "completions/mean_terminated_length": 246.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.33020949363708496, "epoch": 1.8982843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.3909020629085493, "kl": 0.11101092398166656, "learning_rate": 3.5978630534699865e-07, "loss": -0.032, "num_tokens": 48909500.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.8210878372192383, "sampling/importance_sampling_ratio/mean": 1.0001590251922607, "sampling/importance_sampling_ratio/min": 0.5914841890335083, "sampling/sampling_logp_difference/max": 0.5994340181350708, "sampling/sampling_logp_difference/mean": 0.015389536507427692, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 221.359375, "completions/mean_terminated_length": 221.359375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.29521629214286804, "epoch": 1.8995098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.7727821848863863, "kl": 0.08533985912799835, "learning_rate": 3.591026526510742e-07, "loss": -0.0259, "num_tokens": 48944819.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000815510749817, "sampling/importance_sampling_ratio/min": 0.5918084979057312, "sampling/sampling_logp_difference/max": 0.7039074897766113, "sampling/sampling_logp_difference/mean": 0.014909489080309868, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.31491348147392273, "epoch": 1.9007352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 1.3101563688698865, "kl": 0.09991556406021118, "learning_rate": 3.584192859678391e-07, "loss": 0.0041, "num_tokens": 48977059.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8134992122650146, "sampling/importance_sampling_ratio/mean": 0.9999299049377441, "sampling/importance_sampling_ratio/min": 0.4001094698905945, "sampling/sampling_logp_difference/max": 0.9160170555114746, "sampling/sampling_logp_difference/mean": 0.015464898198843002, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 227.96875, "completions/mean_terminated_length": 227.96875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.371176540851593, "epoch": 1.9019607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.650144214127249, "kl": 0.12711381912231445, "learning_rate": 3.577362066844838e-07, "loss": 0.0399, "num_tokens": 49009761.0, "reward": 0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6179018020629883, "sampling/importance_sampling_ratio/mean": 0.9994966387748718, "sampling/importance_sampling_ratio/min": 0.5008446574211121, "sampling/sampling_logp_difference/max": 0.6914592981338501, "sampling/sampling_logp_difference/mean": 0.017303530126810074, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 183.859375, "completions/mean_terminated_length": 183.859375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.25452548265457153, "epoch": 1.903186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.3072688658521017, "kl": 0.12985754013061523, "learning_rate": 3.570534161876163e-07, "loss": 0.021, "num_tokens": 49036280.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999993622303009, "sampling/importance_sampling_ratio/min": 0.4193432331085205, "sampling/sampling_logp_difference/max": 0.8690655827522278, "sampling/sampling_logp_difference/mean": 0.014714433811604977, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 204.515625, "completions/mean_terminated_length": 204.515625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2913321554660797, "epoch": 1.9044117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.3504290648689279, "kl": 0.08884111791849136, "learning_rate": 3.5637091586325796e-07, "loss": -0.0062, "num_tokens": 49072265.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002952814102173, "sampling/importance_sampling_ratio/min": 0.4834056496620178, "sampling/sampling_logp_difference/max": 0.931877613067627, "sampling/sampling_logp_difference/mean": 0.01549257431179285, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 201.984375, "completions/mean_terminated_length": 201.984375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2502061128616333, "epoch": 1.905637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.6647828098078714, "kl": 0.10091829299926758, "learning_rate": 3.556887070968414e-07, "loss": 0.0131, "num_tokens": 49103464.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007450580596924, "sampling/importance_sampling_ratio/min": 0.48473289608955383, "sampling/sampling_logp_difference/max": 0.7275445461273193, "sampling/sampling_logp_difference/mean": 0.014591528102755547, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 211.03125, "completions/mean_terminated_length": 211.03125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.30164915323257446, "epoch": 1.906862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.9772018158037367, "kl": 0.09650573134422302, "learning_rate": 3.550067912732069e-07, "loss": 0.021, "num_tokens": 49137610.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 1.0000648498535156, "sampling/importance_sampling_ratio/min": 0.6071924567222595, "sampling/sampling_logp_difference/max": 0.49890947341918945, "sampling/sampling_logp_difference/mean": 0.015118611045181751, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 224.546875, "completions/mean_terminated_length": 224.546875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3440561890602112, "epoch": 1.9080882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.6866728597437486, "kl": 0.10658356547355652, "learning_rate": 3.5432516977660054e-07, "loss": 0.0174, "num_tokens": 49169485.0, "reward": 0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993762373924255, "sampling/importance_sampling_ratio/min": 0.6254568696022034, "sampling/sampling_logp_difference/max": 0.7467336654663086, "sampling/sampling_logp_difference/mean": 0.016454674303531647, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 197.78125, "completions/mean_terminated_length": 197.78125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.20059680938720703, "epoch": 1.909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.04640111395933607, "kl": 0.09063167124986649, "learning_rate": 3.5364384399067094e-07, "loss": 0.0009, "num_tokens": 49199887.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993793964385986, "sampling/importance_sampling_ratio/min": 0.41134488582611084, "sampling/sampling_logp_difference/max": 0.8883233070373535, "sampling/sampling_logp_difference/mean": 0.012949159368872643, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 179.515625, "completions/mean_terminated_length": 179.515625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3543946444988251, "epoch": 1.9105392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 2.382435530039299, "kl": 0.12880370020866394, "learning_rate": 3.5296281529846593e-07, "loss": 0.0287, "num_tokens": 49241648.0, "reward": 0.125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8798727989196777, "sampling/importance_sampling_ratio/mean": 1.0006365776062012, "sampling/importance_sampling_ratio/min": 0.42330774664878845, "sampling/sampling_logp_difference/max": 0.8596558570861816, "sampling/sampling_logp_difference/mean": 0.019419439136981964, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 176.78125, "completions/mean_terminated_length": 176.78125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23964226245880127, "epoch": 1.9117647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 1.3608993259996585, "kl": 0.11298610270023346, "learning_rate": 3.5228208508243073e-07, "loss": 0.0092, "num_tokens": 49267362.0, "reward": -0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6379655599594116, "sampling/importance_sampling_ratio/mean": 1.0000442266464233, "sampling/importance_sampling_ratio/min": 0.6133005619049072, "sampling/sampling_logp_difference/max": 0.4934549331665039, "sampling/sampling_logp_difference/mean": 0.013549616560339928, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 203.671875, "completions/mean_terminated_length": 203.671875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.23592735826969147, "epoch": 1.9129901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.1011763669808319, "kl": 0.09119807183742523, "learning_rate": 3.5160165472440467e-07, "loss": 0.0155, "num_tokens": 49298365.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4173837900161743, "sampling/importance_sampling_ratio/mean": 0.9997701048851013, "sampling/importance_sampling_ratio/min": 0.3053518831729889, "sampling/sampling_logp_difference/max": 1.1862905025482178, "sampling/sampling_logp_difference/mean": 0.013066626153886318, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 194.796875, "completions/mean_terminated_length": 194.796875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.31049007177352905, "epoch": 1.9142156862745097, "frac_reward_zero_std": 0.25, "grad_norm": 2.568139086966258, "kl": 0.19013534486293793, "learning_rate": 3.509215256056183e-07, "loss": -0.0101, "num_tokens": 49328656.0, "reward": 0.46875, "reward_std": 0.625, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5504263639450073, "sampling/importance_sampling_ratio/mean": 0.9998918771743774, "sampling/importance_sampling_ratio/min": 0.27835866808891296, "sampling/sampling_logp_difference/max": 1.2788448333740234, "sampling/sampling_logp_difference/mean": 0.0173039548099041, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 195.8125, "completions/mean_terminated_length": 195.8125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2837347984313965, "epoch": 1.9154411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.7436632255652411, "kl": 0.09441693127155304, "learning_rate": 3.502416991066904e-07, "loss": 0.0184, "num_tokens": 49357812.0, "reward": 0.4375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5426450967788696, "sampling/importance_sampling_ratio/mean": 0.9997560977935791, "sampling/importance_sampling_ratio/min": 0.6066765785217285, "sampling/sampling_logp_difference/max": 0.4997594356536865, "sampling/sampling_logp_difference/mean": 0.01340518333017826, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 211.078125, "completions/mean_terminated_length": 211.078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.27355197072029114, "epoch": 1.9166666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 1.777329766937861, "kl": 0.07297226041555405, "learning_rate": 3.495621766076259e-07, "loss": 0.0355, "num_tokens": 49389689.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.635582685470581, "sampling/importance_sampling_ratio/mean": 1.000390887260437, "sampling/importance_sampling_ratio/min": 0.6435860395431519, "sampling/sampling_logp_difference/max": 0.49199914932250977, "sampling/sampling_logp_difference/mean": 0.014909939840435982, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 185.359375, "completions/mean_terminated_length": 185.359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.23646235466003418, "epoch": 1.9178921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.05513252817736153, "kl": 0.1208547055721283, "learning_rate": 3.488829594878123e-07, "loss": 0.0011, "num_tokens": 49418960.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.620741367340088, "sampling/importance_sampling_ratio/mean": 0.9998612403869629, "sampling/importance_sampling_ratio/min": 0.5412285923957825, "sampling/sampling_logp_difference/max": 0.6139135360717773, "sampling/sampling_logp_difference/mean": 0.014054106548428535, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 173.453125, "completions/mean_terminated_length": 173.453125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.28601402044296265, "epoch": 1.9191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.04753205139430537, "kl": 0.09612636268138885, "learning_rate": 3.4820404912601757e-07, "loss": 0.001, "num_tokens": 49451613.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.793167233467102, "sampling/importance_sampling_ratio/mean": 1.0006403923034668, "sampling/importance_sampling_ratio/min": 0.5947459936141968, "sampling/sampling_logp_difference/max": 0.5839834213256836, "sampling/sampling_logp_difference/mean": 0.01622890681028366, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 183.71875, "completions/mean_terminated_length": 183.71875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.20884793996810913, "epoch": 1.920343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.07671820774126588, "kl": 0.07233886420726776, "learning_rate": 3.4752544690038643e-07, "loss": 0.0007, "num_tokens": 49479867.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000246524810791, "sampling/importance_sampling_ratio/min": 0.21938800811767578, "sampling/sampling_logp_difference/max": 1.5169134140014648, "sampling/sampling_logp_difference/mean": 0.012974189594388008, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 220.4375, "completions/mean_terminated_length": 220.4375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3239063620567322, "epoch": 1.9215686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 1.6904427287074242, "kl": 0.150712251663208, "learning_rate": 3.468471541884385e-07, "loss": 0.077, "num_tokens": 49508743.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996858835220337, "sampling/importance_sampling_ratio/min": 0.6147395372390747, "sampling/sampling_logp_difference/max": 0.711000919342041, "sampling/sampling_logp_difference/mean": 0.016206160187721252, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 224.703125, "completions/mean_terminated_length": 224.703125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2642619013786316, "epoch": 1.9227941176470589, "frac_reward_zero_std": 0.25, "grad_norm": 1.8621671333958125, "kl": 0.0999397486448288, "learning_rate": 3.461691723670651e-07, "loss": 0.0122, "num_tokens": 49539140.0, "reward": 0.09375, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6767631769180298, "sampling/importance_sampling_ratio/mean": 0.9997922778129578, "sampling/importance_sampling_ratio/min": 0.4298660457134247, "sampling/sampling_logp_difference/max": 0.8442816734313965, "sampling/sampling_logp_difference/mean": 0.014188846573233604, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 153.328125, "completions/mean_terminated_length": 153.328125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.20939096808433533, "epoch": 1.9240196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.07113598995777741, "kl": 0.08482904732227325, "learning_rate": 3.454915028125263e-07, "loss": 0.0008, "num_tokens": 49566601.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8483213186264038, "sampling/importance_sampling_ratio/mean": 1.0005934238433838, "sampling/importance_sampling_ratio/min": 0.536852240562439, "sampling/sampling_logp_difference/max": 0.6220324039459229, "sampling/sampling_logp_difference/mean": 0.013298461213707924, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 173.796875, "completions/mean_terminated_length": 173.796875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29373040795326233, "epoch": 1.9252450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.6116127291643694, "kl": 0.10015298426151276, "learning_rate": 3.4481414690044836e-07, "loss": 0.0044, "num_tokens": 49598204.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6278108358383179, "sampling/importance_sampling_ratio/mean": 1.0005826950073242, "sampling/importance_sampling_ratio/min": 0.6176396012306213, "sampling/sampling_logp_difference/max": 0.48723602294921875, "sampling/sampling_logp_difference/mean": 0.015931256115436554, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 197.53125, "completions/mean_terminated_length": 197.53125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.210597962141037, "epoch": 1.9264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.712648102169922, "kl": 0.08676162362098694, "learning_rate": 3.441371060058209e-07, "loss": 0.0443, "num_tokens": 49629326.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0011491775512695, "sampling/importance_sampling_ratio/min": 0.6279329061508179, "sampling/sampling_logp_difference/max": 0.8303616046905518, "sampling/sampling_logp_difference/mean": 0.012944528833031654, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 149.671875, "completions/mean_terminated_length": 149.671875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.253029465675354, "epoch": 1.9276960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0507027200014654, "kl": 0.08234013617038727, "learning_rate": 3.4346038150299425e-07, "loss": 0.0008, "num_tokens": 49651897.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6307547092437744, "sampling/importance_sampling_ratio/mean": 0.999323844909668, "sampling/importance_sampling_ratio/min": 0.5186219811439514, "sampling/sampling_logp_difference/max": 0.6565799713134766, "sampling/sampling_logp_difference/mean": 0.016448473557829857, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2652842104434967, "epoch": 1.928921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.6480919424848328, "kl": 0.10174965858459473, "learning_rate": 3.427839747656758e-07, "loss": 0.0169, "num_tokens": 49683677.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.605762243270874, "sampling/importance_sampling_ratio/mean": 0.9999074935913086, "sampling/importance_sampling_ratio/min": 0.48236799240112305, "sampling/sampling_logp_difference/max": 0.729047954082489, "sampling/sampling_logp_difference/mean": 0.014856329187750816, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24237948656082153, "epoch": 1.9301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.055333795911757, "kl": 0.10452765226364136, "learning_rate": 3.4210788716692875e-07, "loss": 0.0259, "num_tokens": 49710045.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5084569454193115, "sampling/importance_sampling_ratio/mean": 1.0006563663482666, "sampling/importance_sampling_ratio/min": 0.6150839924812317, "sampling/sampling_logp_difference/max": 0.4859964847564697, "sampling/sampling_logp_difference/mean": 0.012377345934510231, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 175.703125, "completions/mean_terminated_length": 175.703125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.35965946316719055, "epoch": 1.9313725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.6300033230824567, "kl": 0.15588462352752686, "learning_rate": 3.414321200791679e-07, "loss": -0.0031, "num_tokens": 49745786.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5989322662353516, "sampling/importance_sampling_ratio/mean": 1.000009536743164, "sampling/importance_sampling_ratio/min": 0.5671254396438599, "sampling/sampling_logp_difference/max": 0.5671747922897339, "sampling/sampling_logp_difference/mean": 0.01786952093243599, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 194.65625, "completions/mean_terminated_length": 194.65625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3209313750267029, "epoch": 1.9325980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 2.3238252262505257, "kl": 0.1693599373102188, "learning_rate": 3.4075667487415785e-07, "loss": -0.0046, "num_tokens": 49779076.0, "reward": 0.59375, "reward_std": 0.659286618232727, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5972504615783691, "sampling/importance_sampling_ratio/mean": 1.0004091262817383, "sampling/importance_sampling_ratio/min": 0.6081857681274414, "sampling/sampling_logp_difference/max": 0.49727487564086914, "sampling/sampling_logp_difference/mean": 0.016534799709916115, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 190.53125, "completions/mean_terminated_length": 190.53125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.35380131006240845, "epoch": 1.9338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.15518261084076634, "kl": 0.12272585928440094, "learning_rate": 3.4008155292300934e-07, "loss": 0.0013, "num_tokens": 49806102.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5486302375793457, "sampling/importance_sampling_ratio/mean": 0.9994552731513977, "sampling/importance_sampling_ratio/min": 0.6216931343078613, "sampling/sampling_logp_difference/max": 0.4753086566925049, "sampling/sampling_logp_difference/mean": 0.017209038138389587, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 195.265625, "completions/mean_terminated_length": 195.265625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2244930863380432, "epoch": 1.9350490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.9183236785939335, "kl": 0.08692613244056702, "learning_rate": 3.3940675559617723e-07, "loss": -0.0734, "num_tokens": 49840839.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6320115327835083, "sampling/importance_sampling_ratio/mean": 1.000977635383606, "sampling/importance_sampling_ratio/min": 0.5341154932975769, "sampling/sampling_logp_difference/max": 0.627143144607544, "sampling/sampling_logp_difference/mean": 0.014392497949302197, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 211.53125, "completions/mean_terminated_length": 211.53125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3096480369567871, "epoch": 1.9362745098039216, "frac_reward_zero_std": 0.0, "grad_norm": 2.7361189983841854, "kl": 0.15013748407363892, "learning_rate": 3.3873228426345757e-07, "loss": 0.0301, "num_tokens": 49867225.0, "reward": -0.0625, "reward_std": 0.8220869898796082, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6300376653671265, "sampling/importance_sampling_ratio/mean": 1.0004205703735352, "sampling/importance_sampling_ratio/min": 0.587834358215332, "sampling/sampling_logp_difference/max": 0.5313100814819336, "sampling/sampling_logp_difference/mean": 0.015822414308786392, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 185.03125, "completions/mean_terminated_length": 185.03125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2975696921348572, "epoch": 1.9375, "frac_reward_zero_std": 0.75, "grad_norm": 1.2834407010139355, "kl": 0.1522301435470581, "learning_rate": 3.380581402939841e-07, "loss": 0.0249, "num_tokens": 49892683.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.85191011428833, "sampling/importance_sampling_ratio/mean": 0.9997633099555969, "sampling/importance_sampling_ratio/min": 0.4871048033237457, "sampling/sampling_logp_difference/max": 0.719275951385498, "sampling/sampling_logp_difference/mean": 0.015590054914355278, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 217.609375, "completions/mean_terminated_length": 217.609375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2924651503562927, "epoch": 1.9387254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.5024869556632245, "kl": 0.09442883729934692, "learning_rate": 3.373843250562265e-07, "loss": -0.0221, "num_tokens": 49927778.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995254874229431, "sampling/importance_sampling_ratio/min": 0.4735029935836792, "sampling/sampling_logp_difference/max": 0.814455509185791, "sampling/sampling_logp_difference/mean": 0.01673107221722603, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 203.90625, "completions/mean_terminated_length": 203.90625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.23768167197704315, "epoch": 1.9399509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.112366008579369, "kl": 0.14769932627677917, "learning_rate": 3.3671083991798697e-07, "loss": -0.0012, "num_tokens": 49956188.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6751999855041504, "sampling/importance_sampling_ratio/mean": 0.9998273849487305, "sampling/importance_sampling_ratio/min": 0.6054726243019104, "sampling/sampling_logp_difference/max": 0.515932559967041, "sampling/sampling_logp_difference/mean": 0.01382248941808939, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.21476219594478607, "epoch": 1.9411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.04603193232824236, "kl": 0.07713761180639267, "learning_rate": 3.360376862463978e-07, "loss": 0.0008, "num_tokens": 49982236.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001596212387085, "sampling/importance_sampling_ratio/min": 0.5286228656768799, "sampling/sampling_logp_difference/max": 0.7049002647399902, "sampling/sampling_logp_difference/mean": 0.01282772608101368, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 205.71875, "completions/mean_terminated_length": 205.71875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.27220970392227173, "epoch": 1.9424019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.7500272508696701, "kl": 0.08735904097557068, "learning_rate": 3.3536486540791823e-07, "loss": -0.0007, "num_tokens": 50011098.0, "reward": 0.3125, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5705015659332275, "sampling/importance_sampling_ratio/mean": 0.9996979236602783, "sampling/importance_sampling_ratio/min": 0.6129493117332458, "sampling/sampling_logp_difference/max": 0.4894731044769287, "sampling/sampling_logp_difference/mean": 0.013286584988236427, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 181.640625, "completions/mean_terminated_length": 181.640625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2341674566268921, "epoch": 1.9436274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.3454292313143326, "kl": 0.08026061952114105, "learning_rate": 3.3469237876833187e-07, "loss": -0.0013, "num_tokens": 50044403.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002331733703613, "sampling/importance_sampling_ratio/min": 0.21306173503398895, "sampling/sampling_logp_difference/max": 1.546173334121704, "sampling/sampling_logp_difference/mean": 0.013525542803108692, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 207.4375, "completions/mean_terminated_length": 207.4375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23868905007839203, "epoch": 1.9448529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 2.3329696932606168, "kl": 0.08665573596954346, "learning_rate": 3.340202276927442e-07, "loss": 0.1692, "num_tokens": 50077727.0, "reward": 0.8125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5756419897079468, "sampling/importance_sampling_ratio/mean": 0.9999165534973145, "sampling/importance_sampling_ratio/min": 0.4954255223274231, "sampling/sampling_logp_difference/max": 0.7023382186889648, "sampling/sampling_logp_difference/mean": 0.014149514958262444, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 276.25, "completions/mean_terminated_length": 276.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.32216912508010864, "epoch": 1.946078431372549, "frac_reward_zero_std": 0.25, "grad_norm": 1.6251851960529853, "kl": 0.10423454642295837, "learning_rate": 3.333484135455792e-07, "loss": 0.0141, "num_tokens": 50115519.0, "reward": -0.40625, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5839189291000366, "sampling/importance_sampling_ratio/mean": 1.0003249645233154, "sampling/importance_sampling_ratio/min": 0.5713785290718079, "sampling/sampling_logp_difference/max": 0.5597033500671387, "sampling/sampling_logp_difference/mean": 0.01547469012439251, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 189.578125, "completions/mean_terminated_length": 189.578125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3143174648284912, "epoch": 1.9473039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 2.081170655936288, "kl": 0.10240406543016434, "learning_rate": 3.326769376905769e-07, "loss": 0.0319, "num_tokens": 50161412.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9267932176589966, "sampling/importance_sampling_ratio/mean": 1.0005072355270386, "sampling/importance_sampling_ratio/min": 0.5947045087814331, "sampling/sampling_logp_difference/max": 0.6558570861816406, "sampling/sampling_logp_difference/mean": 0.01755983754992485, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 255.65625, "completions/mean_terminated_length": 255.65625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3196178674697876, "epoch": 1.9485294117647058, "frac_reward_zero_std": 0.25, "grad_norm": 1.792804478170327, "kl": 0.09776392579078674, "learning_rate": 3.3200580149079083e-07, "loss": 0.0003, "num_tokens": 50201534.0, "reward": 0.34375, "reward_std": 0.606805682182312, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.954484462738037, "sampling/importance_sampling_ratio/mean": 0.9998239278793335, "sampling/importance_sampling_ratio/min": 0.5243139863014221, "sampling/sampling_logp_difference/max": 0.6701264381408691, "sampling/sampling_logp_difference/mean": 0.016514722257852554, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 209.828125, "completions/mean_terminated_length": 209.828125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.36181557178497314, "epoch": 1.9497549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.619087280242418, "kl": 0.11058124899864197, "learning_rate": 3.31335006308585e-07, "loss": 0.0166, "num_tokens": 50230595.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6574147939682007, "sampling/importance_sampling_ratio/mean": 1.0001424551010132, "sampling/importance_sampling_ratio/min": 0.6203011274337769, "sampling/sampling_logp_difference/max": 0.5052590370178223, "sampling/sampling_logp_difference/mean": 0.017097918316721916, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 178.5625, "completions/mean_terminated_length": 178.5625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2967524826526642, "epoch": 1.9509803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.9463706558816324, "kl": 0.1831379532814026, "learning_rate": 3.3066455350563115e-07, "loss": 0.0281, "num_tokens": 50257351.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.638442039489746, "sampling/importance_sampling_ratio/mean": 0.9994776844978333, "sampling/importance_sampling_ratio/min": 0.44796988368034363, "sampling/sampling_logp_difference/max": 0.8030292987823486, "sampling/sampling_logp_difference/mean": 0.015726547688245773, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 166.796875, "completions/mean_terminated_length": 166.796875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3216024339199066, "epoch": 1.9522058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.9469362686105434, "kl": 0.12963071465492249, "learning_rate": 3.29994444442906e-07, "loss": 0.0012, "num_tokens": 50285946.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.621140718460083, "sampling/importance_sampling_ratio/mean": 1.000431776046753, "sampling/importance_sampling_ratio/min": 0.6146537065505981, "sampling/sampling_logp_difference/max": 0.4866962432861328, "sampling/sampling_logp_difference/mean": 0.01588505692780018, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28989577293395996, "epoch": 1.9534313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 1.8948272853844492, "kl": 0.13485756516456604, "learning_rate": 3.2932468048068836e-07, "loss": -0.0265, "num_tokens": 50317930.0, "reward": 0.1875, "reward_std": 0.6531128883361816, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005075931549072, "sampling/importance_sampling_ratio/min": 0.3008410334587097, "sampling/sampling_logp_difference/max": 1.2011733055114746, "sampling/sampling_logp_difference/mean": 0.016114819794893265, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 220.984375, "completions/mean_terminated_length": 220.984375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2438029646873474, "epoch": 1.954656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.6301769553324, "kl": 0.10240459442138672, "learning_rate": 3.2865526297855694e-07, "loss": 0.0118, "num_tokens": 50353417.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7520787715911865, "sampling/importance_sampling_ratio/mean": 1.0001596212387085, "sampling/importance_sampling_ratio/min": 0.6250287294387817, "sampling/sampling_logp_difference/max": 0.5608029365539551, "sampling/sampling_logp_difference/mean": 0.012727048248052597, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 140.109375, "completions/mean_terminated_length": 140.109375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24194109439849854, "epoch": 1.9558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 2.382676593859849, "kl": 0.11661958694458008, "learning_rate": 3.2798619329538646e-07, "loss": 0.0123, "num_tokens": 50378544.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6962480545043945, "sampling/importance_sampling_ratio/mean": 1.0009865760803223, "sampling/importance_sampling_ratio/min": 0.5991002321243286, "sampling/sampling_logp_difference/max": 0.528418779373169, "sampling/sampling_logp_difference/mean": 0.014339487068355083, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 198.90625, "completions/mean_terminated_length": 198.90625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3294591009616852, "epoch": 1.9571078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.537652087376244, "kl": 0.12539511919021606, "learning_rate": 3.2731747278934623e-07, "loss": 0.0314, "num_tokens": 50411658.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994417428970337, "sampling/importance_sampling_ratio/min": 0.4973803460597992, "sampling/sampling_logp_difference/max": 1.0194251537322998, "sampling/sampling_logp_difference/mean": 0.017773278057575226, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 229.078125, "completions/mean_terminated_length": 229.078125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.24577781558036804, "epoch": 1.9583333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.3640381789741722, "kl": 0.07902765274047852, "learning_rate": 3.266491028178964e-07, "loss": 0.0208, "num_tokens": 50445311.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7745686769485474, "sampling/importance_sampling_ratio/mean": 1.000251054763794, "sampling/importance_sampling_ratio/min": 0.3827075958251953, "sampling/sampling_logp_difference/max": 0.9604840278625488, "sampling/sampling_logp_difference/mean": 0.012837364338338375, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 200.015625, "completions/mean_terminated_length": 200.015625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.20478859543800354, "epoch": 1.9595588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.041420986646954754, "kl": 0.0642724260687828, "learning_rate": 3.2598108473778595e-07, "loss": 0.0006, "num_tokens": 50475504.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6025726795196533, "sampling/importance_sampling_ratio/mean": 0.9999085068702698, "sampling/importance_sampling_ratio/min": 0.6138669848442078, "sampling/sampling_logp_difference/max": 0.4879770278930664, "sampling/sampling_logp_difference/mean": 0.011987491510808468, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 161.59375, "completions/mean_terminated_length": 161.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.32719677686691284, "epoch": 1.9607843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.4560910886561118, "kl": 0.13007999956607819, "learning_rate": 3.253134199050489e-07, "loss": 0.0307, "num_tokens": 50503574.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5920594930648804, "sampling/importance_sampling_ratio/mean": 0.9995177388191223, "sampling/importance_sampling_ratio/min": 0.5006640553474426, "sampling/sampling_logp_difference/max": 0.6918199062347412, "sampling/sampling_logp_difference/mean": 0.016694029793143272, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 168.484375, "completions/mean_terminated_length": 168.484375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2404722273349762, "epoch": 1.9620098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.20710936380695089, "kl": 0.1067638099193573, "learning_rate": 3.2464610967500273e-07, "loss": 0.0011, "num_tokens": 50534469.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7070449590682983, "sampling/importance_sampling_ratio/mean": 1.0011273622512817, "sampling/importance_sampling_ratio/min": 0.6255688667297363, "sampling/sampling_logp_difference/max": 0.5347638130187988, "sampling/sampling_logp_difference/mean": 0.014591801911592484, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2132427841424942, "epoch": 1.9632352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 1.303006906849855, "kl": 0.07793626189231873, "learning_rate": 3.239791554022449e-07, "loss": 0.0146, "num_tokens": 50565061.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6684820652008057, "sampling/importance_sampling_ratio/mean": 0.9997797012329102, "sampling/importance_sampling_ratio/min": 0.48260486125946045, "sampling/sampling_logp_difference/max": 0.7285571098327637, "sampling/sampling_logp_difference/mean": 0.013838795945048332, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 162.890625, "completions/mean_terminated_length": 162.890625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2564378082752228, "epoch": 1.9644607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.294506117895636, "kl": 0.09519857168197632, "learning_rate": 3.233125584406505e-07, "loss": -0.0097, "num_tokens": 50594606.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.988013505935669, "sampling/importance_sampling_ratio/mean": 1.0003509521484375, "sampling/importance_sampling_ratio/min": 0.6083920001983643, "sampling/sampling_logp_difference/max": 0.6871359348297119, "sampling/sampling_logp_difference/mean": 0.01461451593786478, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 172.53125, "completions/mean_terminated_length": 172.53125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1933905929327011, "epoch": 1.965686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.039783349055590174, "kl": 0.06623965501785278, "learning_rate": 3.226463201433688e-07, "loss": 0.0006, "num_tokens": 50625808.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8062872886657715, "sampling/importance_sampling_ratio/mean": 0.9996256828308105, "sampling/importance_sampling_ratio/min": 0.4821670353412628, "sampling/sampling_logp_difference/max": 0.7294646501541138, "sampling/sampling_logp_difference/mean": 0.01147723849862814, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 199.53125, "completions/mean_terminated_length": 199.53125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2979511618614197, "epoch": 1.9669117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.9639861993875047, "kl": 0.1098884642124176, "learning_rate": 3.219804418628216e-07, "loss": -0.0108, "num_tokens": 50658978.0, "reward": 0.71875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7011988162994385, "sampling/importance_sampling_ratio/mean": 0.999648928642273, "sampling/importance_sampling_ratio/min": 0.4642985165119171, "sampling/sampling_logp_difference/max": 0.7672275304794312, "sampling/sampling_logp_difference/mean": 0.01718880608677864, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 166.421875, "completions/mean_terminated_length": 166.421875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.21918423473834991, "epoch": 1.968137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.1573657034302032, "kl": 0.07675355672836304, "learning_rate": 3.2131492495069965e-07, "loss": 0.022, "num_tokens": 50690877.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6303189992904663, "sampling/importance_sampling_ratio/mean": 1.0002782344818115, "sampling/importance_sampling_ratio/min": 0.33144819736480713, "sampling/sampling_logp_difference/max": 1.1042838096618652, "sampling/sampling_logp_difference/mean": 0.012949703261256218, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 199.28125, "completions/mean_terminated_length": 199.28125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2782513201236725, "epoch": 1.969362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.3616340941440406, "kl": 0.09660226106643677, "learning_rate": 3.206497707579598e-07, "loss": -0.068, "num_tokens": 50723055.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992948770523071, "sampling/importance_sampling_ratio/min": 0.5910187363624573, "sampling/sampling_logp_difference/max": 0.9737603664398193, "sampling/sampling_logp_difference/mean": 0.015251495875418186, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 182.84375, "completions/mean_terminated_length": 182.84375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.22558334469795227, "epoch": 1.9705882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.5858964919685112, "kl": 0.07987121492624283, "learning_rate": 3.199849806348233e-07, "loss": -0.0067, "num_tokens": 50753557.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4054455757141113, "sampling/importance_sampling_ratio/mean": 0.9998422861099243, "sampling/importance_sampling_ratio/min": 0.6121768355369568, "sampling/sampling_logp_difference/max": 0.4907341003417969, "sampling/sampling_logp_difference/mean": 0.013224356807768345, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26014161109924316, "epoch": 1.971813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.09850640337484572, "kl": 0.12250128388404846, "learning_rate": 3.1932055593077166e-07, "loss": 0.0012, "num_tokens": 50777813.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6067885160446167, "sampling/importance_sampling_ratio/mean": 0.999258279800415, "sampling/importance_sampling_ratio/min": 0.6246219277381897, "sampling/sampling_logp_difference/max": 0.47423744201660156, "sampling/sampling_logp_difference/mean": 0.015161692164838314, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 215.859375, "completions/mean_terminated_length": 215.859375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.27312037348747253, "epoch": 1.9730392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.1018882184775826, "kl": 0.06669708341360092, "learning_rate": 3.186564979945453e-07, "loss": 0.0007, "num_tokens": 50810908.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.607377052307129, "sampling/importance_sampling_ratio/mean": 0.9995588064193726, "sampling/importance_sampling_ratio/min": 0.17369602620601654, "sampling/sampling_logp_difference/max": 1.75044846534729, "sampling/sampling_logp_difference/mean": 0.014527924358844757, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 253.8125, "completions/mean_terminated_length": 253.8125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.21349847316741943, "epoch": 1.9742647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 1.2571214131193265, "kl": 0.05080464482307434, "learning_rate": 3.179928081741394e-07, "loss": -0.0023, "num_tokens": 50853984.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5920687913894653, "sampling/importance_sampling_ratio/mean": 1.0000137090682983, "sampling/importance_sampling_ratio/min": 0.6220495700836182, "sampling/sampling_logp_difference/max": 0.4747354984283447, "sampling/sampling_logp_difference/mean": 0.012062931433320045, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2812146246433258, "epoch": 1.9754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.041661626205210384, "kl": 0.07609853148460388, "learning_rate": 3.173294878168025e-07, "loss": 0.0008, "num_tokens": 50885608.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5630258321762085, "sampling/importance_sampling_ratio/mean": 1.0003044605255127, "sampling/importance_sampling_ratio/min": 0.6308006644248962, "sampling/sampling_logp_difference/max": 0.46076536178588867, "sampling/sampling_logp_difference/mean": 0.014220191165804863, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 182.390625, "completions/mean_terminated_length": 182.390625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3182164430618286, "epoch": 1.9767156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 1.8922180588213584, "kl": 0.11925086379051208, "learning_rate": 3.166665382690327e-07, "loss": 0.0473, "num_tokens": 50916465.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002799034118652, "sampling/importance_sampling_ratio/min": 0.40446823835372925, "sampling/sampling_logp_difference/max": 1.0331223011016846, "sampling/sampling_logp_difference/mean": 0.019244685769081116, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 208.234375, "completions/mean_terminated_length": 208.234375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.23336933553218842, "epoch": 1.9779411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.3085149064833899, "kl": 0.07907348871231079, "learning_rate": 3.1600396087657586e-07, "loss": 0.0517, "num_tokens": 50945344.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4848120212554932, "sampling/importance_sampling_ratio/mean": 1.0001044273376465, "sampling/importance_sampling_ratio/min": 0.5559201240539551, "sampling/sampling_logp_difference/max": 0.5871306657791138, "sampling/sampling_logp_difference/mean": 0.012774837203323841, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2610890865325928, "epoch": 1.9791666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.040694708580853736, "kl": 0.07112009823322296, "learning_rate": 3.153417569844219e-07, "loss": 0.0007, "num_tokens": 50982992.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6413822174072266, "sampling/importance_sampling_ratio/mean": 0.9998020529747009, "sampling/importance_sampling_ratio/min": 0.486931711435318, "sampling/sampling_logp_difference/max": 0.7196313142776489, "sampling/sampling_logp_difference/mean": 0.014887130819261074, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 196.34375, "completions/mean_terminated_length": 196.34375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2693784534931183, "epoch": 1.9803921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.2412155777803517, "kl": 0.09714090079069138, "learning_rate": 3.1467992793680267e-07, "loss": -0.0023, "num_tokens": 51019750.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7114192247390747, "sampling/importance_sampling_ratio/mean": 1.0001533031463623, "sampling/importance_sampling_ratio/min": 0.44735607504844666, "sampling/sampling_logp_difference/max": 0.8044005036354065, "sampling/sampling_logp_difference/mean": 0.015305680222809315, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 154.328125, "completions/mean_terminated_length": 154.328125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.24011406302452087, "epoch": 1.9816176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.330582147848195, "kl": 0.1220964565873146, "learning_rate": 3.140184750771895e-07, "loss": 0.0123, "num_tokens": 51046587.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4753892421722412, "sampling/importance_sampling_ratio/mean": 1.0000015497207642, "sampling/importance_sampling_ratio/min": 0.605623185634613, "sampling/sampling_logp_difference/max": 0.5014972686767578, "sampling/sampling_logp_difference/mean": 0.013417389243841171, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 178.828125, "completions/mean_terminated_length": 178.828125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2093338817358017, "epoch": 1.982843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.053016830426800644, "kl": 0.08074681460857391, "learning_rate": 3.133573997482896e-07, "loss": 0.0007, "num_tokens": 51081232.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996959567070007, "sampling/importance_sampling_ratio/min": 0.41164177656173706, "sampling/sampling_logp_difference/max": 0.9979722499847412, "sampling/sampling_logp_difference/mean": 0.014013232663273811, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 157.359375, "completions/mean_terminated_length": 157.359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.23903775215148926, "epoch": 1.9840686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.04974164464841695, "kl": 0.07109355926513672, "learning_rate": 3.1269670329204393e-07, "loss": 0.0007, "num_tokens": 51111383.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6349729299545288, "sampling/importance_sampling_ratio/mean": 0.9996782541275024, "sampling/importance_sampling_ratio/min": 0.33583348989486694, "sampling/sampling_logp_difference/max": 1.091139793395996, "sampling/sampling_logp_difference/mean": 0.014817346818745136, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 139.53125, "completions/mean_terminated_length": 139.53125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.22959363460540771, "epoch": 1.9852941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.527513624288118, "kl": 0.0815734714269638, "learning_rate": 3.1203638704962465e-07, "loss": -0.0098, "num_tokens": 51137593.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001217126846313, "sampling/importance_sampling_ratio/min": 0.4106025695800781, "sampling/sampling_logp_difference/max": 1.313234806060791, "sampling/sampling_logp_difference/mean": 0.014172550290822983, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 215.984375, "completions/mean_terminated_length": 215.984375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2980981469154358, "epoch": 1.9865196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.3724718548416854, "kl": 0.15632514655590057, "learning_rate": 3.11376452361432e-07, "loss": 0.0159, "num_tokens": 51166440.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.597182273864746, "sampling/importance_sampling_ratio/mean": 0.9992493391036987, "sampling/importance_sampling_ratio/min": 0.6223132014274597, "sampling/sampling_logp_difference/max": 0.47431182861328125, "sampling/sampling_logp_difference/mean": 0.016231466084718704, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 203.078125, "completions/mean_terminated_length": 203.078125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.18610015511512756, "epoch": 1.9877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.054405014400212094, "kl": 0.07010403275489807, "learning_rate": 3.107169005670912e-07, "loss": 0.0007, "num_tokens": 51194733.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6613489389419556, "sampling/importance_sampling_ratio/mean": 0.9995604753494263, "sampling/importance_sampling_ratio/min": 0.3985402286052704, "sampling/sampling_logp_difference/max": 0.9199469089508057, "sampling/sampling_logp_difference/mean": 0.012981155887246132, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 137.703125, "completions/mean_terminated_length": 137.703125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.20478081703186035, "epoch": 1.9889705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.933138185478157, "kl": 0.10997778922319412, "learning_rate": 3.100577330054508e-07, "loss": -0.0035, "num_tokens": 51224362.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5855019092559814, "sampling/importance_sampling_ratio/mean": 0.99997878074646, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4679851531982422, "sampling/sampling_logp_difference/mean": 0.012617578729987144, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31237688660621643, "epoch": 1.9901960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 3.1662963505110064, "kl": 0.14189650118350983, "learning_rate": 3.0939895101457914e-07, "loss": -0.0092, "num_tokens": 51251210.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6554615497589111, "sampling/importance_sampling_ratio/mean": 1.00002121925354, "sampling/importance_sampling_ratio/min": 0.6378210783004761, "sampling/sampling_logp_difference/max": 0.5040798187255859, "sampling/sampling_logp_difference/mean": 0.015452057123184204, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 285.578125, "completions/mean_terminated_length": 285.578125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.16210177540779114, "epoch": 1.991421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.029328000825859252, "kl": 0.05415792763233185, "learning_rate": 3.087405559317622e-07, "loss": 0.0005, "num_tokens": 51286975.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.761606216430664, "sampling/importance_sampling_ratio/mean": 0.9996249675750732, "sampling/importance_sampling_ratio/min": 0.5097636580467224, "sampling/sampling_logp_difference/max": 0.6738080978393555, "sampling/sampling_logp_difference/mean": 0.009517800062894821, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 197.203125, "completions/mean_terminated_length": 197.203125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.26196715235710144, "epoch": 1.9926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 2.022580389597765, "kl": 0.10579611361026764, "learning_rate": 3.0808254909349986e-07, "loss": -0.0517, "num_tokens": 51316620.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000226497650146, "sampling/importance_sampling_ratio/min": 0.5474581718444824, "sampling/sampling_logp_difference/max": 0.9265649318695068, "sampling/sampling_logp_difference/mean": 0.01420481875538826, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 225.484375, "completions/mean_terminated_length": 225.484375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.24070894718170166, "epoch": 1.9938725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.03621637216943933, "kl": 0.05783402919769287, "learning_rate": 3.0742493183550454e-07, "loss": 0.0005, "num_tokens": 51352907.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9146305322647095, "sampling/importance_sampling_ratio/mean": 1.0004422664642334, "sampling/importance_sampling_ratio/min": 0.5428375005722046, "sampling/sampling_logp_difference/max": 0.6495246887207031, "sampling/sampling_logp_difference/mean": 0.015581740997731686, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 206.3125, "completions/mean_terminated_length": 206.3125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2880558967590332, "epoch": 1.9950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 2.2085552696095494, "kl": 0.10221179574728012, "learning_rate": 3.0676770549269786e-07, "loss": 0.009, "num_tokens": 51387359.0, "reward": 0.375, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.9623781442642212, "sampling/importance_sampling_ratio/mean": 0.9996896386146545, "sampling/importance_sampling_ratio/min": 0.5363050699234009, "sampling/sampling_logp_difference/max": 0.6741571426391602, "sampling/sampling_logp_difference/mean": 0.016578059643507004, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.24638226628303528, "epoch": 1.9963235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.1180435070672479, "kl": 0.09617805480957031, "learning_rate": 3.0611087139920717e-07, "loss": 0.0039, "num_tokens": 51416639.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6157478094100952, "sampling/importance_sampling_ratio/mean": 1.0001851320266724, "sampling/importance_sampling_ratio/min": 0.49470096826553345, "sampling/sampling_logp_difference/max": 0.7038018703460693, "sampling/sampling_logp_difference/mean": 0.015872016549110413, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 178.890625, "completions/mean_terminated_length": 178.890625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2153649926185608, "epoch": 1.9975490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.1406778974066792, "kl": 0.06773126870393753, "learning_rate": 3.054544308883643e-07, "loss": 0.0007, "num_tokens": 51447352.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5612868070602417, "sampling/importance_sampling_ratio/mean": 0.9997575283050537, "sampling/importance_sampling_ratio/min": 0.611184298992157, "sampling/sampling_logp_difference/max": 0.4923567771911621, "sampling/sampling_logp_difference/mean": 0.013232271187007427, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.24145972728729248, "epoch": 1.9987745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 1.4198898444810162, "kl": 0.058622464537620544, "learning_rate": 3.0479838529270186e-07, "loss": -0.029, "num_tokens": 51476880.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7522567510604858, "sampling/importance_sampling_ratio/mean": 1.0002727508544922, "sampling/importance_sampling_ratio/min": 0.48241639137268066, "sampling/sampling_logp_difference/max": 0.728947639465332, "sampling/sampling_logp_difference/mean": 0.01329050399363041, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 242.71875, "completions/mean_terminated_length": 242.71875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2610500454902649, "epoch": 2.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.6373740660379532, "kl": 0.11630761623382568, "learning_rate": 3.0414273594395103e-07, "loss": 0.0245, "num_tokens": 51511198.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.784226894378662, "sampling/importance_sampling_ratio/mean": 0.9999319911003113, "sampling/importance_sampling_ratio/min": 0.631510317325592, "sampling/sampling_logp_difference/max": 0.5789852142333984, "sampling/sampling_logp_difference/mean": 0.013508956879377365, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2706983685493469, "epoch": 2.0012254901960786, "frac_reward_zero_std": 0.75, "grad_norm": 1.3157672512576266, "kl": 0.13925369083881378, "learning_rate": 3.034874841730382e-07, "loss": 0.0319, "num_tokens": 51544934.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.9059497117996216, "sampling/importance_sampling_ratio/mean": 0.9994834661483765, "sampling/importance_sampling_ratio/min": 0.44495993852615356, "sampling/sampling_logp_difference/max": 0.8097710609436035, "sampling/sampling_logp_difference/mean": 0.015468169935047626, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24652785062789917, "epoch": 2.002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.3515748376029462, "kl": 0.1129264310002327, "learning_rate": 3.0283263131008307e-07, "loss": -0.0103, "num_tokens": 51573478.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999708533287048, "sampling/importance_sampling_ratio/min": 0.4943215250968933, "sampling/sampling_logp_difference/max": 0.7337357997894287, "sampling/sampling_logp_difference/mean": 0.016296112909913063, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 205.953125, "completions/mean_terminated_length": 205.953125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.307706356048584, "epoch": 2.0036764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.0952524559695727, "kl": 0.0964609831571579, "learning_rate": 3.0217817868439545e-07, "loss": 0.0052, "num_tokens": 51601283.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6070095300674438, "sampling/importance_sampling_ratio/mean": 0.9998152256011963, "sampling/importance_sampling_ratio/min": 0.5948570966720581, "sampling/sampling_logp_difference/max": 0.5194340944290161, "sampling/sampling_logp_difference/mean": 0.015333171933889389, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 144.859375, "completions/mean_terminated_length": 144.859375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.20275729894638062, "epoch": 2.0049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.10984783925805666, "kl": 0.07397639751434326, "learning_rate": 3.015241276244729e-07, "loss": 0.0007, "num_tokens": 51627770.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.627306342124939, "sampling/importance_sampling_ratio/mean": 1.0005347728729248, "sampling/importance_sampling_ratio/min": 0.44181501865386963, "sampling/sampling_logp_difference/max": 0.816864013671875, "sampling/sampling_logp_difference/mean": 0.013549655675888062, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20195108652114868, "epoch": 2.0061274509803924, "frac_reward_zero_std": 0.75, "grad_norm": 1.3513810206626793, "kl": 0.08534161746501923, "learning_rate": 3.0087047945799724e-07, "loss": 0.0043, "num_tokens": 51654682.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6472231149673462, "sampling/importance_sampling_ratio/mean": 1.0002919435501099, "sampling/importance_sampling_ratio/min": 0.3821452260017395, "sampling/sampling_logp_difference/max": 0.9619545936584473, "sampling/sampling_logp_difference/mean": 0.012385329231619835, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 186.296875, "completions/mean_terminated_length": 186.296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24972376227378845, "epoch": 2.0073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.6237297199685563, "kl": 0.11721492558717728, "learning_rate": 3.002172355118331e-07, "loss": 0.004, "num_tokens": 51686381.0, "reward": 0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4443892240524292, "sampling/importance_sampling_ratio/mean": 1.0004124641418457, "sampling/importance_sampling_ratio/min": 0.6345492601394653, "sampling/sampling_logp_difference/max": 0.45484042167663574, "sampling/sampling_logp_difference/mean": 0.013366533443331718, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 227.734375, "completions/mean_terminated_length": 227.734375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.35807353258132935, "epoch": 2.008578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.582531226173086, "kl": 0.09623866528272629, "learning_rate": 2.995643971120243e-07, "loss": 0.0143, "num_tokens": 51720076.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7915570735931396, "sampling/importance_sampling_ratio/mean": 1.0000855922698975, "sampling/importance_sampling_ratio/min": 0.5658215284347534, "sampling/sampling_logp_difference/max": 0.5830850601196289, "sampling/sampling_logp_difference/mean": 0.017493925988674164, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 157.59375, "completions/mean_terminated_length": 157.59375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.20872214436531067, "epoch": 2.0098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.527939124978404, "kl": 0.1340126097202301, "learning_rate": 2.9891196558379126e-07, "loss": 0.0012, "num_tokens": 51747458.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4740839004516602, "sampling/importance_sampling_ratio/mean": 1.0000025033950806, "sampling/importance_sampling_ratio/min": 0.39433911442756653, "sampling/sampling_logp_difference/max": 0.9305441379547119, "sampling/sampling_logp_difference/mean": 0.013612005859613419, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.20791053771972656, "epoch": 2.011029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.32638084040683657, "kl": 0.1111663281917572, "learning_rate": 2.9825994225152884e-07, "loss": 0.0011, "num_tokens": 51775074.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8181694746017456, "sampling/importance_sampling_ratio/mean": 0.999299168586731, "sampling/importance_sampling_ratio/min": 0.45959511399269104, "sampling/sampling_logp_difference/max": 0.7774093151092529, "sampling/sampling_logp_difference/mean": 0.013892569579184055, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 165.78125, "completions/mean_terminated_length": 165.78125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2592073380947113, "epoch": 2.0122549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 2.3884884387292047, "kl": 0.09146402031183243, "learning_rate": 2.976083284388031e-07, "loss": 0.0709, "num_tokens": 51804052.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4387812614440918, "sampling/importance_sampling_ratio/mean": 0.9987965226173401, "sampling/importance_sampling_ratio/min": 0.5887529253959656, "sampling/sampling_logp_difference/max": 0.5297486782073975, "sampling/sampling_logp_difference/mean": 0.015599111095070839, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 270.96875, "completions/mean_terminated_length": 270.96875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.2273353636264801, "epoch": 2.013480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.0405693698287233, "kl": 0.06104673817753792, "learning_rate": 2.9695712546834885e-07, "loss": -0.0192, "num_tokens": 51849042.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6454516649246216, "sampling/importance_sampling_ratio/mean": 0.9998835325241089, "sampling/importance_sampling_ratio/min": 0.1766490489244461, "sampling/sampling_logp_difference/max": 1.7335902452468872, "sampling/sampling_logp_difference/mean": 0.013499276712536812, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 180.59375, "completions/mean_terminated_length": 180.59375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.21701395511627197, "epoch": 2.014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.050001106355846855, "kl": 0.08167947828769684, "learning_rate": 2.9630633466206655e-07, "loss": 0.0008, "num_tokens": 51881640.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6478513479232788, "sampling/importance_sampling_ratio/mean": 1.0004024505615234, "sampling/importance_sampling_ratio/min": 0.39870205521583557, "sampling/sampling_logp_difference/max": 0.9195408821105957, "sampling/sampling_logp_difference/mean": 0.014474079012870789, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 165.53125, "completions/mean_terminated_length": 165.53125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.25972750782966614, "epoch": 2.0159313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.476203307790272, "kl": 0.12379482388496399, "learning_rate": 2.9565595734102043e-07, "loss": -0.0181, "num_tokens": 51909386.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6418057680130005, "sampling/importance_sampling_ratio/mean": 1.0005497932434082, "sampling/importance_sampling_ratio/min": 0.5912330746650696, "sampling/sampling_logp_difference/max": 0.5255449414253235, "sampling/sampling_logp_difference/mean": 0.016285490244627, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 173.53125, "completions/mean_terminated_length": 173.53125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.22746597230434418, "epoch": 2.017156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.2127214914639128, "kl": 0.12575247883796692, "learning_rate": 2.950059948254355e-07, "loss": -0.0012, "num_tokens": 51938028.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999560117721558, "sampling/importance_sampling_ratio/min": 0.4987714886665344, "sampling/sampling_logp_difference/max": 0.8758199214935303, "sampling/sampling_logp_difference/mean": 0.01391543261706829, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2744201421737671, "epoch": 2.0183823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.5979122875807548, "kl": 0.06626914441585541, "learning_rate": 2.943564484346943e-07, "loss": 0.0352, "num_tokens": 51972324.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6574277877807617, "sampling/importance_sampling_ratio/mean": 1.0007280111312866, "sampling/importance_sampling_ratio/min": 0.5644875764846802, "sampling/sampling_logp_difference/max": 0.5718369483947754, "sampling/sampling_logp_difference/mean": 0.013680658303201199, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 182.65625, "completions/mean_terminated_length": 182.65625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2576046586036682, "epoch": 2.019607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 1.6970821857781622, "kl": 0.10263028740882874, "learning_rate": 2.937073194873348e-07, "loss": -0.0233, "num_tokens": 52002846.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999614953994751, "sampling/importance_sampling_ratio/min": 0.6093730926513672, "sampling/sampling_logp_difference/max": 0.7053864002227783, "sampling/sampling_logp_difference/mean": 0.015043235383927822, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 217.34375, "completions/mean_terminated_length": 217.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3107795715332031, "epoch": 2.0208333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.045661501611413365, "kl": 0.07256503403186798, "learning_rate": 2.930586093010477e-07, "loss": 0.0007, "num_tokens": 52032532.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4629426002502441, "sampling/importance_sampling_ratio/mean": 0.9996424317359924, "sampling/importance_sampling_ratio/min": 0.4803723096847534, "sampling/sampling_logp_difference/max": 0.7331938743591309, "sampling/sampling_logp_difference/mean": 0.015414186753332615, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 226.890625, "completions/mean_terminated_length": 226.890625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3023676872253418, "epoch": 2.0220588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.5121507858951995, "kl": 0.12815862894058228, "learning_rate": 2.9241031919267363e-07, "loss": -0.0428, "num_tokens": 52061133.0, "reward": 0.4375, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.504328966140747, "sampling/importance_sampling_ratio/mean": 0.9998545050621033, "sampling/importance_sampling_ratio/min": 0.14129947125911713, "sampling/sampling_logp_difference/max": 1.9568736553192139, "sampling/sampling_logp_difference/mean": 0.01576872542500496, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 204.53125, "completions/mean_terminated_length": 204.53125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.33197981119155884, "epoch": 2.0232843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 1.7555989411861663, "kl": 0.11538799107074738, "learning_rate": 2.917624504782006e-07, "loss": 0.0021, "num_tokens": 52098767.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996583461761475, "sampling/importance_sampling_ratio/min": 0.3384450078010559, "sampling/sampling_logp_difference/max": 1.0833935737609863, "sampling/sampling_logp_difference/mean": 0.01745392009615898, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 150.171875, "completions/mean_terminated_length": 150.171875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2159707099199295, "epoch": 2.0245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.05194142319735925, "kl": 0.07415338605642319, "learning_rate": 2.911150044727605e-07, "loss": 0.0007, "num_tokens": 52131466.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6623896360397339, "sampling/importance_sampling_ratio/mean": 1.0000048875808716, "sampling/importance_sampling_ratio/min": 0.5678412318229675, "sampling/sampling_logp_difference/max": 0.5659134387969971, "sampling/sampling_logp_difference/mean": 0.014511508867144585, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 142.234375, "completions/mean_terminated_length": 142.234375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.194686159491539, "epoch": 2.025735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.6919832040340759, "kl": 0.07016637176275253, "learning_rate": 2.9046798249062824e-07, "loss": -0.0052, "num_tokens": 52161481.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6222556829452515, "sampling/importance_sampling_ratio/mean": 1.0002816915512085, "sampling/importance_sampling_ratio/min": 0.5483796000480652, "sampling/sampling_logp_difference/max": 0.6007875204086304, "sampling/sampling_logp_difference/mean": 0.013081444427371025, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 207.828125, "completions/mean_terminated_length": 207.828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.25522762537002563, "epoch": 2.0269607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.06765847310099816, "kl": 0.0819954201579094, "learning_rate": 2.898213858452173e-07, "loss": 0.0008, "num_tokens": 52193854.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9768575429916382, "sampling/importance_sampling_ratio/mean": 0.9995952248573303, "sampling/importance_sampling_ratio/min": 0.37231582403182983, "sampling/sampling_logp_difference/max": 0.9880127906799316, "sampling/sampling_logp_difference/mean": 0.015854569151997566, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 180.421875, "completions/mean_terminated_length": 180.421875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.23425695300102234, "epoch": 2.028186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.988364758623323, "kl": 0.09826873242855072, "learning_rate": 2.891752158490778e-07, "loss": -0.0345, "num_tokens": 52222905.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4792793989181519, "sampling/importance_sampling_ratio/mean": 0.99944669008255, "sampling/importance_sampling_ratio/min": 0.5910207033157349, "sampling/sampling_logp_difference/max": 0.5259042382240295, "sampling/sampling_logp_difference/mean": 0.012566267512738705, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 288.609375, "completions/mean_terminated_length": 288.609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29428747296333313, "epoch": 2.0294117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.109729955289726, "kl": 0.10290320217609406, "learning_rate": 2.8852947381389405e-07, "loss": 0.0009, "num_tokens": 52261184.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999971866607666, "sampling/importance_sampling_ratio/min": 0.4756595194339752, "sampling/sampling_logp_difference/max": 0.7430529594421387, "sampling/sampling_logp_difference/mean": 0.014198487624526024, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.26211094856262207, "epoch": 2.030637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.1851833220447131, "kl": 0.06994898617267609, "learning_rate": 2.8788416105048117e-07, "loss": 0.0007, "num_tokens": 52298632.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5943752527236938, "sampling/importance_sampling_ratio/mean": 1.000478982925415, "sampling/importance_sampling_ratio/min": 0.6649942994117737, "sampling/sampling_logp_difference/max": 0.46648192405700684, "sampling/sampling_logp_difference/mean": 0.014292774721980095, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2893081307411194, "epoch": 2.031862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.480910825526405, "kl": 0.0972118228673935, "learning_rate": 2.8723927886878396e-07, "loss": -0.0385, "num_tokens": 52334032.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6980862617492676, "sampling/importance_sampling_ratio/mean": 0.9996001124382019, "sampling/importance_sampling_ratio/min": 0.4472591280937195, "sampling/sampling_logp_difference/max": 0.804617166519165, "sampling/sampling_logp_difference/mean": 0.01658753678202629, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 137.859375, "completions/mean_terminated_length": 137.859375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.18066023290157318, "epoch": 2.0330882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.06619401552970619, "kl": 0.06539852917194366, "learning_rate": 2.865948285778713e-07, "loss": 0.0007, "num_tokens": 52355015.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8713295459747314, "sampling/importance_sampling_ratio/mean": 1.0002483129501343, "sampling/importance_sampling_ratio/min": 0.16636940836906433, "sampling/sampling_logp_difference/max": 1.7935445308685303, "sampling/sampling_logp_difference/mean": 0.012770957313477993, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 203.109375, "completions/mean_terminated_length": 203.109375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2152371108531952, "epoch": 2.034313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.309699021857678, "kl": 0.05949123576283455, "learning_rate": 2.8595081148593737e-07, "loss": 0.008, "num_tokens": 52385950.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6382622718811035, "sampling/importance_sampling_ratio/mean": 0.9996830821037292, "sampling/importance_sampling_ratio/min": 0.6056292653083801, "sampling/sampling_logp_difference/max": 0.5014872550964355, "sampling/sampling_logp_difference/mean": 0.012422558851540089, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 196.859375, "completions/mean_terminated_length": 196.859375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.26284027099609375, "epoch": 2.0355392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0482797283796569, "kl": 0.091962069272995, "learning_rate": 2.8530722890029534e-07, "loss": 0.0009, "num_tokens": 52414773.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6307573318481445, "sampling/importance_sampling_ratio/mean": 0.9998902082443237, "sampling/importance_sampling_ratio/min": 0.2332911193370819, "sampling/sampling_logp_difference/max": 1.4554681777954102, "sampling/sampling_logp_difference/mean": 0.015172924846410751, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 178.046875, "completions/mean_terminated_length": 178.046875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2818187475204468, "epoch": 2.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.7199197507014743, "kl": 0.13522930443286896, "learning_rate": 2.8466408212737776e-07, "loss": 0.0245, "num_tokens": 52441512.0, "reward": 0.21875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.705733299255371, "sampling/importance_sampling_ratio/mean": 1.0003222227096558, "sampling/importance_sampling_ratio/min": 0.62389075756073, "sampling/sampling_logp_difference/max": 0.5339951515197754, "sampling/sampling_logp_difference/mean": 0.014451962895691395, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2337619662284851, "epoch": 2.0379901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 7.369306100618678, "kl": 0.10952450335025787, "learning_rate": 2.840213724727315e-07, "loss": 0.0113, "num_tokens": 52473832.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 1.000237226486206, "sampling/importance_sampling_ratio/min": 0.010155175812542439, "sampling/sampling_logp_difference/max": 4.589771747589111, "sampling/sampling_logp_difference/mean": 0.012999322265386581, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 258.328125, "completions/mean_terminated_length": 258.328125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.24671608209609985, "epoch": 2.0392156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.085051997504276, "kl": 0.06979397684335709, "learning_rate": 2.8337910124101625e-07, "loss": -0.0498, "num_tokens": 52506429.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002784729003906, "sampling/importance_sampling_ratio/min": 0.35105717182159424, "sampling/sampling_logp_difference/max": 1.5489122867584229, "sampling/sampling_logp_difference/mean": 0.014655955135822296, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 207.640625, "completions/mean_terminated_length": 207.640625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2997831404209137, "epoch": 2.0404411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.323549977186844, "kl": 0.18803086876869202, "learning_rate": 2.8273726973600254e-07, "loss": 0.0101, "num_tokens": 52540662.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.99234139919281, "sampling/importance_sampling_ratio/mean": 1.0006390810012817, "sampling/importance_sampling_ratio/min": 0.5900681018829346, "sampling/sampling_logp_difference/max": 0.6893105506896973, "sampling/sampling_logp_difference/mean": 0.015998469665646553, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 191.390625, "completions/mean_terminated_length": 191.390625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.21080835163593292, "epoch": 2.0416666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.056386538308938354, "kl": 0.06706511229276657, "learning_rate": 2.8209587926056687e-07, "loss": 0.0007, "num_tokens": 52573439.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.566491961479187, "sampling/importance_sampling_ratio/mean": 1.0001158714294434, "sampling/importance_sampling_ratio/min": 0.5505178570747375, "sampling/sampling_logp_difference/max": 0.5968958139419556, "sampling/sampling_logp_difference/mean": 0.012801194563508034, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 206.6875, "completions/mean_terminated_length": 206.6875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2314741015434265, "epoch": 2.042892156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.5137123224282574, "kl": 0.11053841561079025, "learning_rate": 2.8145493111669183e-07, "loss": -0.0041, "num_tokens": 52602443.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5832947492599487, "sampling/importance_sampling_ratio/mean": 0.999763011932373, "sampling/importance_sampling_ratio/min": 0.29653459787368774, "sampling/sampling_logp_difference/max": 1.2155914306640625, "sampling/sampling_logp_difference/mean": 0.01283535547554493, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 261.234375, "completions/mean_terminated_length": 261.234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.21586529910564423, "epoch": 2.0441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.04806692302841265, "kl": 0.058751266449689865, "learning_rate": 2.808144266054612e-07, "loss": 0.0006, "num_tokens": 52639386.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6159850358963013, "sampling/importance_sampling_ratio/mean": 0.9997762441635132, "sampling/importance_sampling_ratio/min": 0.5676490068435669, "sampling/sampling_logp_difference/max": 0.5662519931793213, "sampling/sampling_logp_difference/mean": 0.011225221678614616, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 256.6875, "completions/mean_terminated_length": 256.6875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.27154284715652466, "epoch": 2.045343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8431077027344933, "kl": 0.08351112902164459, "learning_rate": 2.80174367027059e-07, "loss": 0.0692, "num_tokens": 52670438.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6037511825561523, "sampling/importance_sampling_ratio/mean": 0.9998252391815186, "sampling/importance_sampling_ratio/min": 0.5498784780502319, "sampling/sampling_logp_difference/max": 0.5980579853057861, "sampling/sampling_logp_difference/mean": 0.013166810385882854, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 193.40625, "completions/mean_terminated_length": 193.40625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.30176496505737305, "epoch": 2.0465686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 1.8367215287667253, "kl": 0.14671970903873444, "learning_rate": 2.795347536807653e-07, "loss": 0.0144, "num_tokens": 52696944.0, "reward": -0.21875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.568989872932434, "sampling/importance_sampling_ratio/mean": 1.0002939701080322, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.5662540197372437, "sampling/sampling_logp_difference/mean": 0.015668369829654694, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 222.390625, "completions/mean_terminated_length": 222.390625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3677259683609009, "epoch": 2.047794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.0241763121119458, "kl": 0.11141422390937805, "learning_rate": 2.7889558786495455e-07, "loss": -0.0033, "num_tokens": 52727481.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7051466703414917, "sampling/importance_sampling_ratio/mean": 0.9996929168701172, "sampling/importance_sampling_ratio/min": 0.5259426236152649, "sampling/sampling_logp_difference/max": 0.6425632238388062, "sampling/sampling_logp_difference/mean": 0.017417607828974724, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 224.5625, "completions/mean_terminated_length": 224.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22942262887954712, "epoch": 2.049019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.1211021324738104, "kl": 0.07600408792495728, "learning_rate": 2.782568708770933e-07, "loss": 0.0435, "num_tokens": 52759661.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6190972328186035, "sampling/importance_sampling_ratio/mean": 1.000002145767212, "sampling/importance_sampling_ratio/min": 0.5999808311462402, "sampling/sampling_logp_difference/max": 0.5108575820922852, "sampling/sampling_logp_difference/mean": 0.012174902483820915, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 230.015625, "completions/mean_terminated_length": 230.015625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.21251729130744934, "epoch": 2.0502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.2681871459692562, "kl": 0.07802058756351471, "learning_rate": 2.7761860401373627e-07, "loss": -0.0245, "num_tokens": 52791774.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000063419342041, "sampling/importance_sampling_ratio/min": 0.3121943771839142, "sampling/sampling_logp_difference/max": 1.1641292572021484, "sampling/sampling_logp_difference/mean": 0.012759133242070675, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 289.390625, "completions/mean_terminated_length": 289.390625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.24743464589118958, "epoch": 2.051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.958409340880211, "kl": 0.07729962468147278, "learning_rate": 2.7698078857052474e-07, "loss": 0.0083, "num_tokens": 52824631.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5117584466934204, "sampling/importance_sampling_ratio/mean": 1.0005438327789307, "sampling/importance_sampling_ratio/min": 0.5339420437812805, "sampling/sampling_logp_difference/max": 0.627467930316925, "sampling/sampling_logp_difference/mean": 0.012405122630298138, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 225.96875, "completions/mean_terminated_length": 225.96875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.22162757813930511, "epoch": 2.0526960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.4250513679668029, "kl": 0.07321614772081375, "learning_rate": 2.763434258421836e-07, "loss": -0.0132, "num_tokens": 52856869.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0009355545043945, "sampling/importance_sampling_ratio/min": 0.5484393835067749, "sampling/sampling_logp_difference/max": 1.0633785724639893, "sampling/sampling_logp_difference/mean": 0.012057576328516006, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 217.609375, "completions/mean_terminated_length": 217.609375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.21669289469718933, "epoch": 2.053921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0402609594328909, "kl": 0.0873531922698021, "learning_rate": 2.757065171225192e-07, "loss": 0.0008, "num_tokens": 52885676.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7586616277694702, "sampling/importance_sampling_ratio/mean": 1.0000011920928955, "sampling/importance_sampling_ratio/min": 0.4062422215938568, "sampling/sampling_logp_difference/max": 0.9008057117462158, "sampling/sampling_logp_difference/mean": 0.012449707835912704, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 233.3125, "completions/mean_terminated_length": 233.3125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.24843087792396545, "epoch": 2.0551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.216347259289768, "kl": 0.08403472602367401, "learning_rate": 2.750700637044155e-07, "loss": -0.1293, "num_tokens": 52916912.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996212720870972, "sampling/importance_sampling_ratio/min": 0.5261399745941162, "sampling/sampling_logp_difference/max": 0.7840127944946289, "sampling/sampling_logp_difference/mean": 0.013490501791238785, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 284.015625, "completions/mean_terminated_length": 284.015625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2866605222225189, "epoch": 2.0563725490196076, "frac_reward_zero_std": 0.75, "grad_norm": 1.0521726104314784, "kl": 0.061432041227817535, "learning_rate": 2.7443406687983264e-07, "loss": 0.0113, "num_tokens": 52955777.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.528875708580017, "sampling/importance_sampling_ratio/mean": 0.9994683265686035, "sampling/importance_sampling_ratio/min": 0.5874800086021423, "sampling/sampling_logp_difference/max": 0.5319130420684814, "sampling/sampling_logp_difference/mean": 0.014019916765391827, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29307612776756287, "epoch": 2.0575980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.4559984618208732, "kl": 0.09767642617225647, "learning_rate": 2.7379852793980416e-07, "loss": -0.0363, "num_tokens": 52985865.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5886762142181396, "sampling/importance_sampling_ratio/mean": 0.9998399615287781, "sampling/importance_sampling_ratio/min": 0.6176496744155884, "sampling/sampling_logp_difference/max": 0.48183393478393555, "sampling/sampling_logp_difference/mean": 0.013554751873016357, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3286081552505493, "epoch": 2.0588235294117645, "frac_reward_zero_std": 0.25, "grad_norm": 1.605307084978014, "kl": 0.09625139832496643, "learning_rate": 2.7316344817443363e-07, "loss": -0.0145, "num_tokens": 53023153.0, "reward": 0.15625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.8163264989852905, "sampling/importance_sampling_ratio/mean": 0.999686062335968, "sampling/importance_sampling_ratio/min": 0.6147370934486389, "sampling/sampling_logp_difference/max": 0.5968160629272461, "sampling/sampling_logp_difference/mean": 0.015223829075694084, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 218.40625, "completions/mean_terminated_length": 218.40625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2261694371700287, "epoch": 2.060049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.0154322061573835, "kl": 0.09540575742721558, "learning_rate": 2.7252882887289287e-07, "loss": 0.0049, "num_tokens": 53053099.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5433379411697388, "sampling/importance_sampling_ratio/mean": 1.000070571899414, "sampling/importance_sampling_ratio/min": 0.5636303424835205, "sampling/sampling_logp_difference/max": 0.5733566284179688, "sampling/sampling_logp_difference/mean": 0.011534119956195354, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2440802901983261, "epoch": 2.0612745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.06137228801321598, "kl": 0.08271975815296173, "learning_rate": 2.718946713234185e-07, "loss": 0.0008, "num_tokens": 53079223.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.570361852645874, "sampling/importance_sampling_ratio/mean": 0.9998970627784729, "sampling/importance_sampling_ratio/min": 0.6176997423171997, "sampling/sampling_logp_difference/max": 0.481752872467041, "sampling/sampling_logp_difference/mean": 0.013783842325210571, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2853248119354248, "epoch": 2.0625, "frac_reward_zero_std": 0.75, "grad_norm": 0.8566245712168283, "kl": 0.09553046524524689, "learning_rate": 2.712609768133106e-07, "loss": 0.0116, "num_tokens": 53120495.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6980427503585815, "sampling/importance_sampling_ratio/mean": 0.999942421913147, "sampling/importance_sampling_ratio/min": 0.48238492012023926, "sampling/sampling_logp_difference/max": 0.7290129661560059, "sampling/sampling_logp_difference/mean": 0.014373978599905968, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 225.390625, "completions/mean_terminated_length": 225.390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28901052474975586, "epoch": 2.063725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.9639789878845415, "kl": 0.08507464826107025, "learning_rate": 2.7062774662892886e-07, "loss": -0.0054, "num_tokens": 53156264.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6519145965576172, "sampling/importance_sampling_ratio/mean": 1.0008249282836914, "sampling/importance_sampling_ratio/min": 0.4598377048969269, "sampling/sampling_logp_difference/max": 0.7768816947937012, "sampling/sampling_logp_difference/mean": 0.014761666767299175, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 267.40625, "completions/mean_terminated_length": 267.40625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.2973840534687042, "epoch": 2.064950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.0715930586265272, "kl": 0.12841664254665375, "learning_rate": 2.6999498205569e-07, "loss": 0.04, "num_tokens": 53191410.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9099645614624023, "sampling/importance_sampling_ratio/mean": 0.9996175169944763, "sampling/importance_sampling_ratio/min": 0.480032742023468, "sampling/sampling_logp_difference/max": 0.7339010238647461, "sampling/sampling_logp_difference/mean": 0.016689680516719818, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 244.359375, "completions/mean_terminated_length": 244.359375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.28265005350112915, "epoch": 2.0661764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.889216893984033, "kl": 0.10497517883777618, "learning_rate": 2.693626843780665e-07, "loss": -0.0013, "num_tokens": 53223289.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6166887283325195, "sampling/importance_sampling_ratio/mean": 1.000554084777832, "sampling/importance_sampling_ratio/min": 0.6457998752593994, "sampling/sampling_logp_difference/max": 0.4803800582885742, "sampling/sampling_logp_difference/mean": 0.013796938583254814, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 206.015625, "completions/mean_terminated_length": 206.015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2224217653274536, "epoch": 2.0674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.05156404446184419, "kl": 0.0919741690158844, "learning_rate": 2.687308548795825e-07, "loss": 0.0009, "num_tokens": 53256058.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6253172159194946, "sampling/importance_sampling_ratio/mean": 1.00070059299469, "sampling/importance_sampling_ratio/min": 0.3588513731956482, "sampling/sampling_logp_difference/max": 1.0248470306396484, "sampling/sampling_logp_difference/mean": 0.013268515467643738, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 215.71875, "completions/mean_terminated_length": 215.71875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.28397154808044434, "epoch": 2.0686274509803924, "frac_reward_zero_std": 0.5, "grad_norm": 1.4321573737376183, "kl": 0.10913024842739105, "learning_rate": 2.6809949484281164e-07, "loss": -0.0355, "num_tokens": 53296264.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9980440139770508, "sampling/importance_sampling_ratio/mean": 0.9999949932098389, "sampling/importance_sampling_ratio/min": 0.36637455224990845, "sampling/sampling_logp_difference/max": 1.0040991306304932, "sampling/sampling_logp_difference/mean": 0.016209449619054794, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 182.953125, "completions/mean_terminated_length": 182.953125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.22761885821819305, "epoch": 2.0698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.2727016130142637, "kl": 0.11536600440740585, "learning_rate": 2.674686055493748e-07, "loss": 0.0002, "num_tokens": 53325733.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.573062539100647, "sampling/importance_sampling_ratio/mean": 1.0001211166381836, "sampling/importance_sampling_ratio/min": 0.5370011925697327, "sampling/sampling_logp_difference/max": 0.6217550039291382, "sampling/sampling_logp_difference/mean": 0.013285119086503983, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 197.921875, "completions/mean_terminated_length": 197.921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2960512042045593, "epoch": 2.071078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.1194285862158084, "kl": 0.10327520221471786, "learning_rate": 2.668381882799375e-07, "loss": -0.0157, "num_tokens": 53356320.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7529429197311401, "sampling/importance_sampling_ratio/mean": 0.9996784925460815, "sampling/importance_sampling_ratio/min": 0.5263916254043579, "sampling/sampling_logp_difference/max": 0.6417098045349121, "sampling/sampling_logp_difference/mean": 0.01602327823638916, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 197.640625, "completions/mean_terminated_length": 197.640625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2522090673446655, "epoch": 2.0723039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.3949340716952856, "kl": 0.0917755737900734, "learning_rate": 2.662082443142068e-07, "loss": 0.0016, "num_tokens": 53385241.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5211591720581055, "sampling/importance_sampling_ratio/mean": 1.0000898838043213, "sampling/importance_sampling_ratio/min": 0.5760922431945801, "sampling/sampling_logp_difference/max": 0.5514874458312988, "sampling/sampling_logp_difference/mean": 0.01480923593044281, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 241.140625, "completions/mean_terminated_length": 241.140625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2808194160461426, "epoch": 2.073529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 1.8278898187120234, "kl": 0.09493476897478104, "learning_rate": 2.6557877493092883e-07, "loss": 0.0219, "num_tokens": 53417746.0, "reward": 0.3125, "reward_std": 0.6645200252532959, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6169688701629639, "sampling/importance_sampling_ratio/mean": 0.9999876022338867, "sampling/importance_sampling_ratio/min": 0.601413905620575, "sampling/sampling_logp_difference/max": 0.5084719657897949, "sampling/sampling_logp_difference/mean": 0.01424361951649189, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 245.359375, "completions/mean_terminated_length": 245.359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2270507961511612, "epoch": 2.0747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.2972264315716464, "kl": 0.07362878322601318, "learning_rate": 2.6494978140788686e-07, "loss": 0.0472, "num_tokens": 53449753.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6605846881866455, "sampling/importance_sampling_ratio/mean": 1.000180959701538, "sampling/importance_sampling_ratio/min": 0.6133704781532288, "sampling/sampling_logp_difference/max": 0.5071697235107422, "sampling/sampling_logp_difference/mean": 0.01344931311905384, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 228.703125, "completions/mean_terminated_length": 228.703125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.27688461542129517, "epoch": 2.075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.2010355458651496, "kl": 0.08339104056358337, "learning_rate": 2.643212650218976e-07, "loss": 0.0019, "num_tokens": 53482918.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.589503288269043, "sampling/importance_sampling_ratio/mean": 0.9999773502349854, "sampling/importance_sampling_ratio/min": 0.4946943521499634, "sampling/sampling_logp_difference/max": 0.703815221786499, "sampling/sampling_logp_difference/mean": 0.014928510412573814, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 238.765625, "completions/mean_terminated_length": 238.765625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2972226142883301, "epoch": 2.077205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 2.024096463484243, "kl": 0.10847216844558716, "learning_rate": 2.6369322704881e-07, "loss": 0.0235, "num_tokens": 53519671.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6023660898208618, "sampling/importance_sampling_ratio/mean": 1.0001966953277588, "sampling/importance_sampling_ratio/min": 0.405647337436676, "sampling/sampling_logp_difference/max": 0.9022711515426636, "sampling/sampling_logp_difference/mean": 0.01701277121901512, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.23059415817260742, "epoch": 2.0784313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 1.3897163961565249, "kl": 0.08193601667881012, "learning_rate": 2.6306566876350067e-07, "loss": 0.0239, "num_tokens": 53562047.0, "reward": 0.84375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6016916036605835, "sampling/importance_sampling_ratio/mean": 1.0005524158477783, "sampling/importance_sampling_ratio/min": 0.6175822019577026, "sampling/sampling_logp_difference/max": 0.48194313049316406, "sampling/sampling_logp_difference/mean": 0.01181773655116558, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 161.578125, "completions/mean_terminated_length": 161.578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.19032913446426392, "epoch": 2.079656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.054749233067084666, "kl": 0.08211810141801834, "learning_rate": 2.6243859143987367e-07, "loss": 0.0008, "num_tokens": 53585268.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7697570323944092, "sampling/importance_sampling_ratio/mean": 1.000309944152832, "sampling/importance_sampling_ratio/min": 0.07049079239368439, "sampling/sampling_logp_difference/max": 2.652273178100586, "sampling/sampling_logp_difference/mean": 0.012656516395509243, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 254.015625, "completions/mean_terminated_length": 254.015625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.27198344469070435, "epoch": 2.0808823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.0004974301028033, "kl": 0.08722476661205292, "learning_rate": 2.6181199635085616e-07, "loss": 0.0209, "num_tokens": 53616421.0, "reward": 0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6436246633529663, "sampling/importance_sampling_ratio/mean": 0.9999319314956665, "sampling/importance_sampling_ratio/min": 0.4887487292289734, "sampling/sampling_logp_difference/max": 0.7159067392349243, "sampling/sampling_logp_difference/mean": 0.01370446290820837, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 277.28125, "completions/mean_terminated_length": 277.28125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2722983956336975, "epoch": 2.082107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.1210881745154528, "kl": 0.055352453142404556, "learning_rate": 2.6118588476839607e-07, "loss": -0.0103, "num_tokens": 53651767.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.971760630607605, "sampling/importance_sampling_ratio/mean": 0.9999873042106628, "sampling/importance_sampling_ratio/min": 0.4567083418369293, "sampling/sampling_logp_difference/max": 0.7837103605270386, "sampling/sampling_logp_difference/mean": 0.014415323734283447, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 171.40625, "completions/mean_terminated_length": 171.40625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.20028963685035706, "epoch": 2.0833333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.5168636163216445, "kl": 0.06290790438652039, "learning_rate": 2.6056025796346094e-07, "loss": 0.0298, "num_tokens": 53679313.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7804113626480103, "sampling/importance_sampling_ratio/mean": 0.9998703002929688, "sampling/importance_sampling_ratio/min": 0.46356526017189026, "sampling/sampling_logp_difference/max": 0.768808126449585, "sampling/sampling_logp_difference/mean": 0.012557139620184898, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 224.484375, "completions/mean_terminated_length": 224.484375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.26865464448928833, "epoch": 2.0845588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.750853709796323, "kl": 0.09681269526481628, "learning_rate": 2.599351172060329e-07, "loss": 0.0792, "num_tokens": 53711472.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6248468160629272, "sampling/importance_sampling_ratio/mean": 0.9993053078651428, "sampling/importance_sampling_ratio/min": 0.6057427525520325, "sampling/sampling_logp_difference/max": 0.5012998580932617, "sampling/sampling_logp_difference/mean": 0.014584803953766823, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 219.296875, "completions/mean_terminated_length": 219.296875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2701049745082855, "epoch": 2.0857843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 1.7294079164994691, "kl": 0.09864313900470734, "learning_rate": 2.593104637651087e-07, "loss": 0.0051, "num_tokens": 53745267.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6277406215667725, "sampling/importance_sampling_ratio/mean": 0.9999047517776489, "sampling/importance_sampling_ratio/min": 0.4596964120864868, "sampling/sampling_logp_difference/max": 0.7771890163421631, "sampling/sampling_logp_difference/mean": 0.013745477423071861, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 222.78125, "completions/mean_terminated_length": 222.78125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.313871294260025, "epoch": 2.0870098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.2424166278195006, "kl": 0.08946055173873901, "learning_rate": 2.5868629890869463e-07, "loss": -0.0132, "num_tokens": 53777061.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995561838150024, "sampling/importance_sampling_ratio/min": 0.5826087594032288, "sampling/sampling_logp_difference/max": 0.7213950157165527, "sampling/sampling_logp_difference/mean": 0.015899470075964928, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1832994967699051, "epoch": 2.088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.9990023865572616, "kl": 0.07222867012023926, "learning_rate": 2.580626239038061e-07, "loss": 0.0564, "num_tokens": 53809813.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004302263259888, "sampling/importance_sampling_ratio/min": 0.6222401857376099, "sampling/sampling_logp_difference/max": 0.7347879409790039, "sampling/sampling_logp_difference/mean": 0.010797273367643356, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 280.90625, "completions/mean_terminated_length": 280.90625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.32643812894821167, "epoch": 2.0894607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.5066735371030042, "kl": 0.0860418826341629, "learning_rate": 2.5743944001646387e-07, "loss": -0.1456, "num_tokens": 53848527.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000722885131836, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.7048373222351074, "sampling/sampling_logp_difference/mean": 0.014995020814239979, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.28493088483810425, "epoch": 2.090686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 2.007547341251892, "kl": 0.10750987380743027, "learning_rate": 2.568167485116919e-07, "loss": -0.0033, "num_tokens": 53881191.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8167446851730347, "sampling/importance_sampling_ratio/mean": 1.0001343488693237, "sampling/importance_sampling_ratio/min": 0.5686253905296326, "sampling/sampling_logp_difference/max": 0.5970462560653687, "sampling/sampling_logp_difference/mean": 0.015858035534620285, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 193.890625, "completions/mean_terminated_length": 193.890625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.28339624404907227, "epoch": 2.0919117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.5254505440260446, "kl": 0.1307823657989502, "learning_rate": 2.5619455065351435e-07, "loss": -0.1184, "num_tokens": 53913744.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8837978839874268, "sampling/importance_sampling_ratio/mean": 0.9992457628250122, "sampling/importance_sampling_ratio/min": 0.6172720193862915, "sampling/sampling_logp_difference/max": 0.6332898139953613, "sampling/sampling_logp_difference/mean": 0.01515759713947773, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 184.515625, "completions/mean_terminated_length": 184.515625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.16468603909015656, "epoch": 2.093137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.18154179047978547, "kl": 0.09442128241062164, "learning_rate": 2.555728477049532e-07, "loss": 0.0009, "num_tokens": 53940977.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5411758422851562, "sampling/importance_sampling_ratio/mean": 1.0002518892288208, "sampling/importance_sampling_ratio/min": 0.591300368309021, "sampling/sampling_logp_difference/max": 0.5254311561584473, "sampling/sampling_logp_difference/mean": 0.010360531508922577, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 199.453125, "completions/mean_terminated_length": 199.453125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.20893830060958862, "epoch": 2.094362745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.6978978345600726, "kl": 0.11723177134990692, "learning_rate": 2.5495164092802646e-07, "loss": -0.1173, "num_tokens": 53973150.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.969475507736206, "sampling/importance_sampling_ratio/mean": 0.999012291431427, "sampling/importance_sampling_ratio/min": 0.02537323720753193, "sampling/sampling_logp_difference/max": 3.674060344696045, "sampling/sampling_logp_difference/mean": 0.01393135730177164, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2282354086637497, "epoch": 2.0955882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.4792067205301653, "kl": 0.07126940041780472, "learning_rate": 2.5433093158374437e-07, "loss": -0.059, "num_tokens": 54005422.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.690609335899353, "sampling/importance_sampling_ratio/mean": 0.9998007416725159, "sampling/importance_sampling_ratio/min": 0.609043538570404, "sampling/sampling_logp_difference/max": 0.5250890254974365, "sampling/sampling_logp_difference/mean": 0.012127692811191082, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 294.09375, "completions/mean_terminated_length": 294.09375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.268255352973938, "epoch": 2.096813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.564716344849188, "kl": 0.07376987487077713, "learning_rate": 2.537107209321074e-07, "loss": -0.0202, "num_tokens": 54044004.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6540254354476929, "sampling/importance_sampling_ratio/mean": 1.0002803802490234, "sampling/importance_sampling_ratio/min": 0.5292773246765137, "sampling/sampling_logp_difference/max": 0.6362427473068237, "sampling/sampling_logp_difference/mean": 0.013309784233570099, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 195.9365234375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24611814320087433, "epoch": 2.0980392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 1.6217309517852607, "kl": 0.11633831262588501, "learning_rate": 2.5309101023210424e-07, "loss": 1.0697, "num_tokens": 54077044.0, "reward": 0.625, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6280970573425293, "sampling/importance_sampling_ratio/mean": 0.9997564554214478, "sampling/importance_sampling_ratio/min": 0.5820223689079285, "sampling/sampling_logp_difference/max": 0.5412464141845703, "sampling/sampling_logp_difference/mean": 0.012156832963228226, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2816395163536072, "epoch": 2.099264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 2.0319645059625984, "kl": 0.07405499368906021, "learning_rate": 2.524718007417081e-07, "loss": 0.0138, "num_tokens": 54109964.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002710819244385, "sampling/importance_sampling_ratio/min": 0.5820378065109253, "sampling/sampling_logp_difference/max": 0.7261428833007812, "sampling/sampling_logp_difference/mean": 0.014796335250139236, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 251.6875, "completions/mean_terminated_length": 251.6875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.27752459049224854, "epoch": 2.1004901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.4228132897805845, "kl": 0.08228473365306854, "learning_rate": 2.518530937178751e-07, "loss": -0.0115, "num_tokens": 54147112.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6428769826889038, "sampling/importance_sampling_ratio/mean": 0.9996975660324097, "sampling/importance_sampling_ratio/min": 0.524318516254425, "sampling/sampling_logp_difference/max": 0.6456558704376221, "sampling/sampling_logp_difference/mean": 0.013332992792129517, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 206.390625, "completions/mean_terminated_length": 206.390625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3341542184352875, "epoch": 2.1017156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.05268439635242992, "kl": 0.10409331321716309, "learning_rate": 2.512348904165411e-07, "loss": 0.001, "num_tokens": 54178337.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6648049354553223, "sampling/importance_sampling_ratio/mean": 0.9995940923690796, "sampling/importance_sampling_ratio/min": 0.4500335156917572, "sampling/sampling_logp_difference/max": 0.7984331846237183, "sampling/sampling_logp_difference/mean": 0.015868140384554863, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 183.515625, "completions/mean_terminated_length": 183.515625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.18172964453697205, "epoch": 2.1029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.38210724360668885, "kl": 0.06356281787157059, "learning_rate": 2.5061719209262e-07, "loss": 0.0006, "num_tokens": 54204898.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4399677515029907, "sampling/importance_sampling_ratio/mean": 1.0001122951507568, "sampling/importance_sampling_ratio/min": 0.09123151749372482, "sampling/sampling_logp_difference/max": 2.394354820251465, "sampling/sampling_logp_difference/mean": 0.011870104819536209, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 199.796875, "completions/mean_terminated_length": 199.796875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2558775544166565, "epoch": 2.1041666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.0363360716687429, "kl": 0.08049780875444412, "learning_rate": 2.500000000000001e-07, "loss": -0.0034, "num_tokens": 54237365.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6464908123016357, "sampling/importance_sampling_ratio/mean": 1.0003085136413574, "sampling/importance_sampling_ratio/min": 0.5331515073776245, "sampling/sampling_logp_difference/max": 0.6289496421813965, "sampling/sampling_logp_difference/mean": 0.015734516084194183, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 229.03125, "completions/mean_terminated_length": 229.03125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1508364975452423, "epoch": 2.105392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.03236801132415331, "kl": 0.05850634723901749, "learning_rate": 2.49383315391542e-07, "loss": 0.0005, "num_tokens": 54265927.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6401046514511108, "sampling/importance_sampling_ratio/mean": 0.9999222159385681, "sampling/importance_sampling_ratio/min": 0.6105112433433533, "sampling/sampling_logp_difference/max": 0.49476003646850586, "sampling/sampling_logp_difference/mean": 0.010126762092113495, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 183.609375, "completions/mean_terminated_length": 183.609375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.20529486238956451, "epoch": 2.1066176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.9023498920204884, "kl": 0.08519157767295837, "learning_rate": 2.4876713951907685e-07, "loss": -0.0059, "num_tokens": 54293726.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994800686836243, "sampling/importance_sampling_ratio/min": 0.5265898108482361, "sampling/sampling_logp_difference/max": 0.7503294944763184, "sampling/sampling_logp_difference/mean": 0.0120610436424613, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 247.140625, "completions/mean_terminated_length": 247.140625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.22897523641586304, "epoch": 2.107843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.3870651483158327, "kl": 0.1443021297454834, "learning_rate": 2.481514736334022e-07, "loss": 0.0068, "num_tokens": 54323863.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004396438598633, "sampling/importance_sampling_ratio/min": 0.608461320400238, "sampling/sampling_logp_difference/max": 0.7307519912719727, "sampling/sampling_logp_difference/mean": 0.011523693799972534, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 226.140625, "completions/mean_terminated_length": 226.140625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.26521825790405273, "epoch": 2.1090686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.1330318041901481, "kl": 0.08887003362178802, "learning_rate": 2.4753631898428134e-07, "loss": 0.0009, "num_tokens": 54358816.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6283246278762817, "sampling/importance_sampling_ratio/mean": 0.999549925327301, "sampling/importance_sampling_ratio/min": 0.5542429089546204, "sampling/sampling_logp_difference/max": 0.5901522636413574, "sampling/sampling_logp_difference/mean": 0.014785770326852798, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 241.796875, "completions/mean_terminated_length": 241.796875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3558571934700012, "epoch": 2.110294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.6903484152336, "kl": 0.1265546679496765, "learning_rate": 2.4692167682043853e-07, "loss": -0.0268, "num_tokens": 54404451.0, "reward": -0.21875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.9876798391342163, "sampling/importance_sampling_ratio/mean": 1.0000967979431152, "sampling/importance_sampling_ratio/min": 0.24371837079524994, "sampling/sampling_logp_difference/max": 1.4117419719696045, "sampling/sampling_logp_difference/mean": 0.017827820032835007, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 256.8125, "completions/mean_terminated_length": 256.8125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3483244478702545, "epoch": 2.111519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.9842230917924334, "kl": 0.1271551251411438, "learning_rate": 2.4630754838955896e-07, "loss": -0.0121, "num_tokens": 54437655.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999490976333618, "sampling/importance_sampling_ratio/min": 0.5489413738250732, "sampling/sampling_logp_difference/max": 0.771472692489624, "sampling/sampling_logp_difference/mean": 0.017512347549200058, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 216.328125, "completions/mean_terminated_length": 216.328125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.30079299211502075, "epoch": 2.1127450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.027835491538995, "kl": 0.1183740645647049, "learning_rate": 2.456939349382843e-07, "loss": -0.0364, "num_tokens": 54470316.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994838833808899, "sampling/importance_sampling_ratio/min": 0.3653780519962311, "sampling/sampling_logp_difference/max": 1.0068227052688599, "sampling/sampling_logp_difference/mean": 0.015461128205060959, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.22464868426322937, "epoch": 2.113970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0455863379671696, "kl": 0.08102913200855255, "learning_rate": 2.450808377122107e-07, "loss": 0.0008, "num_tokens": 54497180.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005425214767456, "sampling/importance_sampling_ratio/min": 0.6485685110092163, "sampling/sampling_logp_difference/max": 0.7369003295898438, "sampling/sampling_logp_difference/mean": 0.012720860540866852, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 178.6875, "completions/mean_terminated_length": 178.6875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.19968390464782715, "epoch": 2.1151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0752407669557944, "kl": 0.0696045309305191, "learning_rate": 2.4446825795588716e-07, "loss": 0.0007, "num_tokens": 54528104.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5081994533538818, "sampling/importance_sampling_ratio/mean": 1.0000910758972168, "sampling/importance_sampling_ratio/min": 0.6319701671600342, "sampling/sampling_logp_difference/max": 0.45891308784484863, "sampling/sampling_logp_difference/mean": 0.011953980661928654, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 230.3125, "completions/mean_terminated_length": 230.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.21571284532546997, "epoch": 2.116421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.1012846289450315, "kl": 0.06651733815670013, "learning_rate": 2.438561969128114e-07, "loss": 0.0242, "num_tokens": 54562956.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6443967819213867, "sampling/importance_sampling_ratio/mean": 1.0001633167266846, "sampling/importance_sampling_ratio/min": 0.44692784547805786, "sampling/sampling_logp_difference/max": 0.8053581714630127, "sampling/sampling_logp_difference/mean": 0.011286087334156036, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 164.6875, "completions/mean_terminated_length": 164.6875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24942715466022491, "epoch": 2.1176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.5723757822993916, "kl": 0.09061464667320251, "learning_rate": 2.43244655825429e-07, "loss": -0.0274, "num_tokens": 54587672.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.609130859375, "sampling/importance_sampling_ratio/mean": 0.9997722506523132, "sampling/importance_sampling_ratio/min": 0.5053581595420837, "sampling/sampling_logp_difference/max": 0.682487964630127, "sampling/sampling_logp_difference/mean": 0.013893929310142994, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 173.90625, "completions/mean_terminated_length": 173.90625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.15339696407318115, "epoch": 2.1188725490196076, "frac_reward_zero_std": 1.0, "grad_norm": 0.04066181976972253, "kl": 0.04262594133615494, "learning_rate": 2.4263363593512903e-07, "loss": 0.0004, "num_tokens": 54613538.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000312328338623, "sampling/importance_sampling_ratio/min": 0.4290059804916382, "sampling/sampling_logp_difference/max": 0.8462843894958496, "sampling/sampling_logp_difference/mean": 0.009953014552593231, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 161.15625, "completions/mean_terminated_length": 161.15625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.22494977712631226, "epoch": 2.1200980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0627337312082299, "kl": 0.1084507554769516, "learning_rate": 2.4202313848224364e-07, "loss": 0.0011, "num_tokens": 54641212.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6083506345748901, "sampling/importance_sampling_ratio/mean": 1.0007972717285156, "sampling/importance_sampling_ratio/min": 0.6186642050743103, "sampling/sampling_logp_difference/max": 0.4801926612854004, "sampling/sampling_logp_difference/mean": 0.013805609196424484, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 271.921875, "completions/mean_terminated_length": 271.921875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3591475486755371, "epoch": 2.1213235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.9981361482983514, "kl": 0.09604911506175995, "learning_rate": 2.414131647060436e-07, "loss": 0.0004, "num_tokens": 54684295.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7665029764175415, "sampling/importance_sampling_ratio/mean": 0.9997198581695557, "sampling/importance_sampling_ratio/min": 0.47804930806159973, "sampling/sampling_logp_difference/max": 0.7380414009094238, "sampling/sampling_logp_difference/mean": 0.0156893078237772, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 178.03125, "completions/mean_terminated_length": 178.03125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.17584030330181122, "epoch": 2.122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.04942371544216179, "kl": 0.0640132874250412, "learning_rate": 2.4080371584473745e-07, "loss": 0.0007, "num_tokens": 54711561.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6530674695968628, "sampling/importance_sampling_ratio/mean": 0.9999580979347229, "sampling/importance_sampling_ratio/min": 0.5910108685493469, "sampling/sampling_logp_difference/max": 0.5259209275245667, "sampling/sampling_logp_difference/mean": 0.013702481985092163, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 263.546875, "completions/mean_terminated_length": 263.546875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.23605573177337646, "epoch": 2.123774509803922, "frac_reward_zero_std": 0.25, "grad_norm": 1.5635733484446757, "kl": 0.07343361526727676, "learning_rate": 2.4019479313546757e-07, "loss": -0.0533, "num_tokens": 54752972.0, "reward": -0.125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9693632125854492, "sampling/importance_sampling_ratio/mean": 0.9998065233230591, "sampling/importance_sampling_ratio/min": 0.477456659078598, "sampling/sampling_logp_difference/max": 0.7392818927764893, "sampling/sampling_logp_difference/mean": 0.01350579783320427, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 200.71875, "completions/mean_terminated_length": 200.71875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.35887908935546875, "epoch": 2.125, "frac_reward_zero_std": 0.25, "grad_norm": 2.0794125377573662, "kl": 0.13092204928398132, "learning_rate": 2.395863978143083e-07, "loss": -0.0382, "num_tokens": 54791514.0, "reward": -0.0625, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7535253763198853, "sampling/importance_sampling_ratio/mean": 0.9996278285980225, "sampling/importance_sampling_ratio/min": 0.6149638891220093, "sampling/sampling_logp_difference/max": 0.5616282224655151, "sampling/sampling_logp_difference/mean": 0.01837782934308052, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 278.703125, "completions/mean_terminated_length": 278.703125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.22424054145812988, "epoch": 2.126225490196078, "frac_reward_zero_std": 0.5, "grad_norm": 1.4767677478882237, "kl": 0.06706281751394272, "learning_rate": 2.3897853111626417e-07, "loss": -0.0496, "num_tokens": 54829255.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.598141074180603, "sampling/importance_sampling_ratio/mean": 1.000441312789917, "sampling/importance_sampling_ratio/min": 0.3738076984882355, "sampling/sampling_logp_difference/max": 0.9840137958526611, "sampling/sampling_logp_difference/mean": 0.013220787979662418, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 163.3125, "completions/mean_terminated_length": 163.3125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24036374688148499, "epoch": 2.127450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.2965382984415355, "kl": 0.08836943656206131, "learning_rate": 2.383711942752652e-07, "loss": -0.0056, "num_tokens": 54858171.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5497233867645264, "sampling/importance_sampling_ratio/mean": 0.9995202422142029, "sampling/importance_sampling_ratio/min": 0.4811636507511139, "sampling/sampling_logp_difference/max": 0.7315478324890137, "sampling/sampling_logp_difference/mean": 0.014389409683644772, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 215.796875, "completions/mean_terminated_length": 215.796875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.32177525758743286, "epoch": 2.1286764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.5956116910340583, "kl": 0.09418761730194092, "learning_rate": 2.377643885241674e-07, "loss": 0.0455, "num_tokens": 54895390.0, "reward": 0.46875, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5071581602096558, "sampling/importance_sampling_ratio/mean": 1.0002448558807373, "sampling/importance_sampling_ratio/min": 0.5393481254577637, "sampling/sampling_logp_difference/max": 0.617393970489502, "sampling/sampling_logp_difference/mean": 0.015557506121695042, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 201.078125, "completions/mean_terminated_length": 201.078125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3626950979232788, "epoch": 2.1299019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.8998408094699397, "kl": 0.09067320823669434, "learning_rate": 2.371581150947476e-07, "loss": 0.0716, "num_tokens": 54926707.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7580565214157104, "sampling/importance_sampling_ratio/mean": 0.99964439868927, "sampling/importance_sampling_ratio/min": 0.5687322020530701, "sampling/sampling_logp_difference/max": 0.5643455982208252, "sampling/sampling_logp_difference/mean": 0.01802055910229683, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 256.921875, "completions/mean_terminated_length": 256.921875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.24795296788215637, "epoch": 2.1311274509803924, "frac_reward_zero_std": 0.75, "grad_norm": 0.9659707270123654, "kl": 0.08193226158618927, "learning_rate": 2.3655237521770282e-07, "loss": -0.0085, "num_tokens": 54963086.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5624288320541382, "sampling/importance_sampling_ratio/mean": 0.9994508624076843, "sampling/importance_sampling_ratio/min": 0.3866180181503296, "sampling/sampling_logp_difference/max": 0.9503180980682373, "sampling/sampling_logp_difference/mean": 0.012907266616821289, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 193.703125, "completions/mean_terminated_length": 193.703125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2887061536312103, "epoch": 2.1323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.4869115641371657, "kl": 0.10755470395088196, "learning_rate": 2.3594717012264642e-07, "loss": -0.0437, "num_tokens": 54995083.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.650195598602295, "sampling/importance_sampling_ratio/mean": 0.9998593926429749, "sampling/importance_sampling_ratio/min": 0.620280385017395, "sampling/sampling_logp_difference/max": 0.5008938312530518, "sampling/sampling_logp_difference/mean": 0.015658602118492126, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 243.109375, "completions/mean_terminated_length": 243.109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3275303840637207, "epoch": 2.133578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.712720509641171, "kl": 0.11412831395864487, "learning_rate": 2.3534250103810627e-07, "loss": 0.0513, "num_tokens": 55029090.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6466587781906128, "sampling/importance_sampling_ratio/mean": 0.9996645450592041, "sampling/importance_sampling_ratio/min": 0.37343189120292664, "sampling/sampling_logp_difference/max": 0.9850196838378906, "sampling/sampling_logp_difference/mean": 0.015656957402825356, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 278.390625, "completions/mean_terminated_length": 278.390625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3630286455154419, "epoch": 2.1348039215686274, "frac_reward_zero_std": 0.0, "grad_norm": 1.8417698342017348, "kl": 0.11501502245664597, "learning_rate": 2.3473836919152263e-07, "loss": 0.1042, "num_tokens": 55067387.0, "reward": 0.46875, "reward_std": 0.8723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.592509150505066, "sampling/importance_sampling_ratio/mean": 1.0000813007354736, "sampling/importance_sampling_ratio/min": 0.4159523844718933, "sampling/sampling_logp_difference/max": 0.8771845102310181, "sampling/sampling_logp_difference/mean": 0.01706148311495781, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 215.609375, "completions/mean_terminated_length": 215.609375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28509455919265747, "epoch": 2.136029411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.79398630714614, "kl": 0.11121399700641632, "learning_rate": 2.3413477580924475e-07, "loss": 0.074, "num_tokens": 55098930.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8181973695755005, "sampling/importance_sampling_ratio/mean": 0.999822199344635, "sampling/importance_sampling_ratio/min": 0.3234425485134125, "sampling/sampling_logp_difference/max": 1.12873375415802, "sampling/sampling_logp_difference/mean": 0.014676484279334545, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 226.65625, "completions/mean_terminated_length": 226.65625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.20623424649238586, "epoch": 2.1372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.04135339761860059, "kl": 0.0599723644554615, "learning_rate": 2.3353172211652884e-07, "loss": 0.0006, "num_tokens": 55134700.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999659657478333, "sampling/importance_sampling_ratio/min": 0.6207950115203857, "sampling/sampling_logp_difference/max": 0.9944319725036621, "sampling/sampling_logp_difference/mean": 0.011734064668416977, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 188.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2525515556335449, "epoch": 2.138480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.4233141311733437, "kl": 0.09250284731388092, "learning_rate": 2.329292093375356e-07, "loss": 0.0167, "num_tokens": 55162600.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7998061180114746, "sampling/importance_sampling_ratio/mean": 0.9999781847000122, "sampling/importance_sampling_ratio/min": 0.27488502860069275, "sampling/sampling_logp_difference/max": 1.2914023399353027, "sampling/sampling_logp_difference/mean": 0.014600848779082298, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 193.71875, "completions/mean_terminated_length": 193.71875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.29777389764785767, "epoch": 2.139705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.7411493991118427, "kl": 0.12776996195316315, "learning_rate": 2.3232723869532816e-07, "loss": -0.0418, "num_tokens": 55193926.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997895956039429, "sampling/importance_sampling_ratio/min": 0.3743273615837097, "sampling/sampling_logp_difference/max": 1.0381970405578613, "sampling/sampling_logp_difference/mean": 0.01634758710861206, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 210.3125, "completions/mean_terminated_length": 210.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.30516448616981506, "epoch": 2.1409313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.1469881691496775, "kl": 0.1371455043554306, "learning_rate": 2.3172581141186858e-07, "loss": -0.0006, "num_tokens": 55222106.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5972622632980347, "sampling/importance_sampling_ratio/mean": 1.000130534172058, "sampling/importance_sampling_ratio/min": 0.6546982526779175, "sampling/sampling_logp_difference/max": 0.4682910442352295, "sampling/sampling_logp_difference/mean": 0.015357891097664833, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 190.296875, "completions/mean_terminated_length": 190.296875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2997080683708191, "epoch": 2.142156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 2.392021870202034, "kl": 0.11923129856586456, "learning_rate": 2.3112492870801602e-07, "loss": -0.0323, "num_tokens": 55254797.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8146553039550781, "sampling/importance_sampling_ratio/mean": 0.9999638795852661, "sampling/importance_sampling_ratio/min": 0.3732207417488098, "sampling/sampling_logp_difference/max": 0.9855852127075195, "sampling/sampling_logp_difference/mean": 0.01580945774912834, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 194.796875, "completions/mean_terminated_length": 194.796875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.22310101985931396, "epoch": 2.1433823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 1.40417591739693, "kl": 0.07776062190532684, "learning_rate": 2.3052459180352458e-07, "loss": 0.052, "num_tokens": 55286880.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.612930417060852, "sampling/importance_sampling_ratio/mean": 1.000563621520996, "sampling/importance_sampling_ratio/min": 0.628268301486969, "sampling/sampling_logp_difference/max": 0.47805261611938477, "sampling/sampling_logp_difference/mean": 0.013772143051028252, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 222.90625, "completions/mean_terminated_length": 222.90625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2597554326057434, "epoch": 2.144607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 1.8060586946344956, "kl": 0.08378937095403671, "learning_rate": 2.2992480191704e-07, "loss": -0.0062, "num_tokens": 55325754.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001883506774902, "sampling/importance_sampling_ratio/min": 0.5075283646583557, "sampling/sampling_logp_difference/max": 0.7715139389038086, "sampling/sampling_logp_difference/mean": 0.014584079384803772, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.22471821308135986, "epoch": 2.1458333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.637028134535316, "kl": 0.12206414341926575, "learning_rate": 2.2932556026609777e-07, "loss": 0.0076, "num_tokens": 55359654.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6438500881195068, "sampling/importance_sampling_ratio/mean": 1.0002222061157227, "sampling/importance_sampling_ratio/min": 0.4182717204093933, "sampling/sampling_logp_difference/max": 0.8716239929199219, "sampling/sampling_logp_difference/mean": 0.013608250766992569, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 222.109375, "completions/mean_terminated_length": 222.109375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2933546304702759, "epoch": 2.1470588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.6332167204363761, "kl": 0.08434764295816422, "learning_rate": 2.2872686806712032e-07, "loss": 0.0377, "num_tokens": 55396189.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.612004280090332, "sampling/importance_sampling_ratio/mean": 0.9994946718215942, "sampling/importance_sampling_ratio/min": 0.5971236824989319, "sampling/sampling_logp_difference/max": 0.5156309604644775, "sampling/sampling_logp_difference/mean": 0.015469004400074482, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 191.203125, "completions/mean_terminated_length": 191.203125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.31936416029930115, "epoch": 2.1482843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.5481373616086007, "kl": 0.09290601313114166, "learning_rate": 2.2812872653541498e-07, "loss": -0.0505, "num_tokens": 55432778.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003182888031006, "sampling/importance_sampling_ratio/min": 0.3702169358730316, "sampling/sampling_logp_difference/max": 2.06929612159729, "sampling/sampling_logp_difference/mean": 0.018548715859651566, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 187.484375, "completions/mean_terminated_length": 187.484375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.29913410544395447, "epoch": 2.1495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 2.4493544700953307, "kl": 0.11844295263290405, "learning_rate": 2.2753113688517155e-07, "loss": -0.0129, "num_tokens": 55468217.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.9188014268875122, "sampling/importance_sampling_ratio/mean": 1.0003360509872437, "sampling/importance_sampling_ratio/min": 0.5287709832191467, "sampling/sampling_logp_difference/max": 0.6517007350921631, "sampling/sampling_logp_difference/mean": 0.015478193759918213, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 212.59375, "completions/mean_terminated_length": 212.59375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3491870164871216, "epoch": 2.150735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.395665365436369, "kl": 0.10798598825931549, "learning_rate": 2.2693410032945853e-07, "loss": -0.0155, "num_tokens": 55504319.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6156541109085083, "sampling/importance_sampling_ratio/mean": 1.0005502700805664, "sampling/importance_sampling_ratio/min": 0.603910505771637, "sampling/sampling_logp_difference/max": 0.5043292045593262, "sampling/sampling_logp_difference/mean": 0.017521340399980545, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 197.84375, "completions/mean_terminated_length": 197.84375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.26326245069503784, "epoch": 2.1519607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.9120040970065213, "kl": 0.09008113294839859, "learning_rate": 2.2633761808022272e-07, "loss": -0.0423, "num_tokens": 55537045.0, "reward": 0.46875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997286200523376, "sampling/importance_sampling_ratio/min": 0.5443238019943237, "sampling/sampling_logp_difference/max": 0.8299136161804199, "sampling/sampling_logp_difference/mean": 0.014764741994440556, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 181.09375, "completions/mean_terminated_length": 181.09375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.24075452983379364, "epoch": 2.153186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.2333513103896843, "kl": 0.12325990200042725, "learning_rate": 2.2574169134828526e-07, "loss": 0.0104, "num_tokens": 55563707.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6210919618606567, "sampling/importance_sampling_ratio/mean": 1.0004346370697021, "sampling/importance_sampling_ratio/min": 0.5685502290725708, "sampling/sampling_logp_difference/max": 0.5646655559539795, "sampling/sampling_logp_difference/mean": 0.013588340021669865, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 222.734375, "completions/mean_terminated_length": 222.734375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2973715364933014, "epoch": 2.1544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.05236871317810249, "kl": 0.11249151825904846, "learning_rate": 2.2514632134333932e-07, "loss": 0.0011, "num_tokens": 55595642.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6206932067871094, "sampling/importance_sampling_ratio/mean": 1.000004529953003, "sampling/importance_sampling_ratio/min": 0.4218224287033081, "sampling/sampling_logp_difference/max": 0.863170862197876, "sampling/sampling_logp_difference/mean": 0.01540279109030962, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 216.3125, "completions/mean_terminated_length": 216.3125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24999091029167175, "epoch": 2.155637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.4996968531414558, "kl": 0.07931873947381973, "learning_rate": 2.2455150927394878e-07, "loss": 0.0293, "num_tokens": 55627822.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.872873306274414, "sampling/importance_sampling_ratio/mean": 0.9997075200080872, "sampling/importance_sampling_ratio/min": 0.5429760813713074, "sampling/sampling_logp_difference/max": 0.6274738311767578, "sampling/sampling_logp_difference/mean": 0.013102727010846138, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 194.703125, "completions/mean_terminated_length": 194.703125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.28348392248153687, "epoch": 2.156862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.1725029071104536, "kl": 0.09887401759624481, "learning_rate": 2.2395725634754402e-07, "loss": -0.0066, "num_tokens": 55659995.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5785914659500122, "sampling/importance_sampling_ratio/mean": 1.00052809715271, "sampling/importance_sampling_ratio/min": 0.6561868190765381, "sampling/sampling_logp_difference/max": 0.45653295516967773, "sampling/sampling_logp_difference/mean": 0.014698462560772896, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 241.71875, "completions/mean_terminated_length": 241.71875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.19589178264141083, "epoch": 2.1580882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.1591875727037666, "kl": 0.05602003633975983, "learning_rate": 2.2336356377042143e-07, "loss": 0.0176, "num_tokens": 55690489.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5980969667434692, "sampling/importance_sampling_ratio/mean": 0.9999008178710938, "sampling/importance_sampling_ratio/min": 0.40407630801200867, "sampling/sampling_logp_difference/max": 0.906151533126831, "sampling/sampling_logp_difference/mean": 0.011489280499517918, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 159.796875, "completions/mean_terminated_length": 159.796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.21925659477710724, "epoch": 2.159313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.09653099240163052, "kl": 0.08598222583532333, "learning_rate": 2.2277043274773854e-07, "loss": 0.0008, "num_tokens": 55719084.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.790588140487671, "sampling/importance_sampling_ratio/mean": 1.0003536939620972, "sampling/importance_sampling_ratio/min": 0.6108417510986328, "sampling/sampling_logp_difference/max": 0.5825440883636475, "sampling/sampling_logp_difference/mean": 0.014810223132371902, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 206.28125, "completions/mean_terminated_length": 206.28125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.23432119190692902, "epoch": 2.1605392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.1632752163632758, "kl": 0.05296270549297333, "learning_rate": 2.221778644835144e-07, "loss": -0.0043, "num_tokens": 55748526.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.944764494895935, "sampling/importance_sampling_ratio/mean": 0.9997674822807312, "sampling/importance_sampling_ratio/min": 0.44626033306121826, "sampling/sampling_logp_difference/max": 0.8068528175354004, "sampling/sampling_logp_difference/mean": 0.0140206478536129, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 183.28125, "completions/mean_terminated_length": 183.28125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.22710749506950378, "epoch": 2.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0621859740927757, "kl": 0.07478615641593933, "learning_rate": 2.215858601806246e-07, "loss": 0.0007, "num_tokens": 55775776.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999275207519531, "sampling/importance_sampling_ratio/min": 0.5677725672721863, "sampling/sampling_logp_difference/max": 0.8884508609771729, "sampling/sampling_logp_difference/mean": 0.01352921687066555, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 162.609375, "completions/mean_terminated_length": 162.609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2223511040210724, "epoch": 2.1629901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 2.048509211810651, "kl": 0.0845133364200592, "learning_rate": 2.2099442104080075e-07, "loss": -0.0478, "num_tokens": 55800503.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5168209075927734, "sampling/importance_sampling_ratio/mean": 1.0006378889083862, "sampling/importance_sampling_ratio/min": 0.5227040648460388, "sampling/sampling_logp_difference/max": 0.6487398147583008, "sampling/sampling_logp_difference/mean": 0.012828944250941277, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 221.328125, "completions/mean_terminated_length": 221.328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2978760004043579, "epoch": 2.1642156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.0565074687144558, "kl": 0.10850057005882263, "learning_rate": 2.2040354826462664e-07, "loss": -0.0032, "num_tokens": 55835996.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9099600315093994, "sampling/importance_sampling_ratio/mean": 0.9992789030075073, "sampling/importance_sampling_ratio/min": 0.28735923767089844, "sampling/sampling_logp_difference/max": 1.2470221519470215, "sampling/sampling_logp_difference/mean": 0.01584113948047161, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 155.53125, "completions/mean_terminated_length": 155.53125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.17103657126426697, "epoch": 2.1654411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.5059775577140329, "kl": 0.06305205821990967, "learning_rate": 2.1981324305153642e-07, "loss": 0.0093, "num_tokens": 55861022.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9056557416915894, "sampling/importance_sampling_ratio/mean": 0.9994888305664062, "sampling/importance_sampling_ratio/min": 0.5924586057662964, "sampling/sampling_logp_difference/max": 0.6448261737823486, "sampling/sampling_logp_difference/mean": 0.012093236669898033, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 233.390625, "completions/mean_terminated_length": 233.390625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3735348582267761, "epoch": 2.1666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 2.4999732441846727, "kl": 0.14664384722709656, "learning_rate": 2.192235065998126e-07, "loss": 0.0184, "num_tokens": 55894487.0, "reward": 0.03125, "reward_std": 0.7744960784912109, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9880139827728271, "sampling/importance_sampling_ratio/mean": 0.9999355673789978, "sampling/importance_sampling_ratio/min": 0.45969530940055847, "sampling/sampling_logp_difference/max": 0.7771914005279541, "sampling/sampling_logp_difference/mean": 0.0183719452470541, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 194.4375, "completions/mean_terminated_length": 194.4375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.22018903493881226, "epoch": 2.167892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0396575520521913, "kl": 0.05755123496055603, "learning_rate": 2.1863434010658272e-07, "loss": 0.0006, "num_tokens": 55925059.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6318347454071045, "sampling/importance_sampling_ratio/mean": 1.0003325939178467, "sampling/importance_sampling_ratio/min": 0.5893513560295105, "sampling/sampling_logp_difference/max": 0.5287327766418457, "sampling/sampling_logp_difference/mean": 0.013786889612674713, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 193.703125, "completions/mean_terminated_length": 193.703125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2954903841018677, "epoch": 2.1691176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 2.140746619090501, "kl": 0.1304420530796051, "learning_rate": 2.1804574476781733e-07, "loss": 0.0963, "num_tokens": 55953584.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7958390712738037, "sampling/importance_sampling_ratio/mean": 1.0007104873657227, "sampling/importance_sampling_ratio/min": 0.27278798818588257, "sampling/sampling_logp_difference/max": 1.2990604639053345, "sampling/sampling_logp_difference/mean": 0.015290407463908195, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 164.6875, "completions/mean_terminated_length": 164.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.17861588299274445, "epoch": 2.170343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.6114393341212648, "kl": 0.10109886527061462, "learning_rate": 2.1745772177832755e-07, "loss": 0.001, "num_tokens": 55983580.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6309486627578735, "sampling/importance_sampling_ratio/mean": 0.9999058246612549, "sampling/importance_sampling_ratio/min": 0.5794409513473511, "sampling/sampling_logp_difference/max": 0.5456914901733398, "sampling/sampling_logp_difference/mean": 0.010914549231529236, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 181.671875, "completions/mean_terminated_length": 181.671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.19727925956249237, "epoch": 2.1715686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.04535562195449047, "kl": 0.05515056103467941, "learning_rate": 2.1687027233176318e-07, "loss": 0.0005, "num_tokens": 56009639.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8589531183242798, "sampling/importance_sampling_ratio/mean": 1.0001940727233887, "sampling/importance_sampling_ratio/min": 0.44680851697921753, "sampling/sampling_logp_difference/max": 0.8056252002716064, "sampling/sampling_logp_difference/mean": 0.013086721301078796, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 222.03125, "completions/mean_terminated_length": 222.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.23033404350280762, "epoch": 2.172794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.3009567177394339, "kl": 0.09654970467090607, "learning_rate": 2.1628339762060914e-07, "loss": -0.112, "num_tokens": 56043625.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6859341859817505, "sampling/importance_sampling_ratio/mean": 1.0001462697982788, "sampling/importance_sampling_ratio/min": 0.6387834548950195, "sampling/sampling_logp_difference/max": 0.5223197937011719, "sampling/sampling_logp_difference/mean": 0.01319871935993433, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 164.546875, "completions/mean_terminated_length": 164.546875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.24296662211418152, "epoch": 2.174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.1667428988558988, "kl": 0.061865609139204025, "learning_rate": 2.1569709883618382e-07, "loss": -0.0109, "num_tokens": 56077308.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6617133617401123, "sampling/importance_sampling_ratio/mean": 1.000140905380249, "sampling/importance_sampling_ratio/min": 0.44791504740715027, "sampling/sampling_logp_difference/max": 0.8031517267227173, "sampling/sampling_logp_difference/mean": 0.01590428128838539, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2676558494567871, "epoch": 2.1752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.1496004248726603, "kl": 0.06484948098659515, "learning_rate": 2.1511137716863687e-07, "loss": 0.0323, "num_tokens": 56114980.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.8461192846298218, "sampling/importance_sampling_ratio/mean": 1.000048279762268, "sampling/importance_sampling_ratio/min": 0.33948543667793274, "sampling/sampling_logp_difference/max": 1.0803241729736328, "sampling/sampling_logp_difference/mean": 0.013982707634568214, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 213.734375, "completions/mean_terminated_length": 213.734375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2765084505081177, "epoch": 2.176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.227470529989764, "kl": 0.14539030194282532, "learning_rate": 2.1452623380694602e-07, "loss": 0.0079, "num_tokens": 56146003.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6098580360412598, "sampling/importance_sampling_ratio/mean": 0.9999431371688843, "sampling/importance_sampling_ratio/min": 0.4871361553668976, "sampling/sampling_logp_difference/max": 0.7192115783691406, "sampling/sampling_logp_difference/mean": 0.014973534271121025, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 245.515625, "completions/mean_terminated_length": 245.515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.24445977807044983, "epoch": 2.1776960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.574898244112307, "kl": 0.10070991516113281, "learning_rate": 2.1394166993891526e-07, "loss": 0.0018, "num_tokens": 56184308.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995533227920532, "sampling/importance_sampling_ratio/min": 0.38582128286361694, "sampling/sampling_logp_difference/max": 2.2083592414855957, "sampling/sampling_logp_difference/mean": 0.013410156592726707, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 216.984375, "completions/mean_terminated_length": 216.984375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2757517099380493, "epoch": 2.178921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.272832905951485, "kl": 0.09829697012901306, "learning_rate": 2.1335768675117205e-07, "loss": 0.0277, "num_tokens": 56217827.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6953853368759155, "sampling/importance_sampling_ratio/mean": 0.9995964765548706, "sampling/importance_sampling_ratio/min": 0.3189980685710907, "sampling/sampling_logp_difference/max": 1.1425702571868896, "sampling/sampling_logp_difference/mean": 0.014155558310449123, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 221.234375, "completions/mean_terminated_length": 221.234375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3149792551994324, "epoch": 2.1801470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 2.166820660842093, "kl": 0.1276342123746872, "learning_rate": 2.1277428542916555e-07, "loss": -0.0022, "num_tokens": 56251330.0, "reward": 0.03125, "reward_std": 0.5959457159042358, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.7731382846832275, "sampling/importance_sampling_ratio/mean": 1.0004637241363525, "sampling/importance_sampling_ratio/min": 0.4084855020046234, "sampling/sampling_logp_difference/max": 0.8952988386154175, "sampling/sampling_logp_difference/mean": 0.017157146707177162, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 229.9375, "completions/mean_terminated_length": 229.9375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2507379651069641, "epoch": 2.1813725490196076, "frac_reward_zero_std": 0.75, "grad_norm": 0.7191440763102566, "kl": 0.06998881697654724, "learning_rate": 2.121914671571633e-07, "loss": -0.0332, "num_tokens": 56280318.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4447228908538818, "sampling/importance_sampling_ratio/mean": 1.0003588199615479, "sampling/importance_sampling_ratio/min": 0.5850769281387329, "sampling/sampling_logp_difference/max": 0.5360119342803955, "sampling/sampling_logp_difference/mean": 0.013089507818222046, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 159.140625, "completions/mean_terminated_length": 159.140625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.20331966876983643, "epoch": 2.1825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.047057333445381995, "kl": 0.07965545356273651, "learning_rate": 2.1160923311824934e-07, "loss": 0.0008, "num_tokens": 56308743.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8327713012695312, "sampling/importance_sampling_ratio/mean": 0.9995776414871216, "sampling/importance_sampling_ratio/min": 0.53630131483078, "sampling/sampling_logp_difference/max": 0.6230590343475342, "sampling/sampling_logp_difference/mean": 0.012753108516335487, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 170.953125, "completions/mean_terminated_length": 170.953125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2831561863422394, "epoch": 2.1838235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.8795245260456575, "kl": 0.1338336020708084, "learning_rate": 2.110275844943223e-07, "loss": -0.0086, "num_tokens": 56334868.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7142467498779297, "sampling/importance_sampling_ratio/mean": 1.000351071357727, "sampling/importance_sampling_ratio/min": 0.6327558159828186, "sampling/sampling_logp_difference/max": 0.5389738082885742, "sampling/sampling_logp_difference/mean": 0.015494199469685555, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 143.984375, "completions/mean_terminated_length": 143.984375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.12787416577339172, "epoch": 2.185049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.04899348871208142, "kl": 0.052631497383117676, "learning_rate": 2.1044652246609173e-07, "loss": 0.0005, "num_tokens": 56356595.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002912282943726, "sampling/importance_sampling_ratio/min": 0.41095978021621704, "sampling/sampling_logp_difference/max": 0.8892599940299988, "sampling/sampling_logp_difference/mean": 0.00995919480919838, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 227.453125, "completions/mean_terminated_length": 227.453125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.24770432710647583, "epoch": 2.186274509803922, "frac_reward_zero_std": 0.25, "grad_norm": 1.850225326570932, "kl": 0.12453025579452515, "learning_rate": 2.098660482130768e-07, "loss": 0.0561, "num_tokens": 56384320.0, "reward": 0.6875, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6594297885894775, "sampling/importance_sampling_ratio/mean": 1.0000152587890625, "sampling/importance_sampling_ratio/min": 0.5000889301300049, "sampling/sampling_logp_difference/max": 0.6929693222045898, "sampling/sampling_logp_difference/mean": 0.013516264036297798, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 153.203125, "completions/mean_terminated_length": 153.203125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.25445735454559326, "epoch": 2.1875, "frac_reward_zero_std": 0.75, "grad_norm": 2.1408699212596036, "kl": 0.11505749076604843, "learning_rate": 2.092861629136033e-07, "loss": -0.0087, "num_tokens": 56410637.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6286143064498901, "sampling/importance_sampling_ratio/mean": 0.9998992681503296, "sampling/importance_sampling_ratio/min": 0.49341726303100586, "sampling/sampling_logp_difference/max": 0.7064000964164734, "sampling/sampling_logp_difference/mean": 0.016297511756420135, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 198.578125, "completions/mean_terminated_length": 198.578125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2938719391822815, "epoch": 2.188725490196078, "frac_reward_zero_std": 0.5, "grad_norm": 1.3957152626390525, "kl": 0.12690046429634094, "learning_rate": 2.0870686774480196e-07, "loss": 0.11, "num_tokens": 56440898.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7653018236160278, "sampling/importance_sampling_ratio/mean": 1.0002646446228027, "sampling/importance_sampling_ratio/min": 0.5458997488021851, "sampling/sampling_logp_difference/max": 0.6053199768066406, "sampling/sampling_logp_difference/mean": 0.016641918569803238, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 219.84375, "completions/mean_terminated_length": 219.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27959156036376953, "epoch": 2.189950980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.8963628929077574, "kl": 0.09853015840053558, "learning_rate": 2.0812816388260519e-07, "loss": 0.0119, "num_tokens": 56475800.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.654282569885254, "sampling/importance_sampling_ratio/mean": 1.0007174015045166, "sampling/importance_sampling_ratio/min": 0.5283967852592468, "sampling/sampling_logp_difference/max": 0.6379077434539795, "sampling/sampling_logp_difference/mean": 0.014839932322502136, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 204.8125, "completions/mean_terminated_length": 204.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2499319463968277, "epoch": 2.1911764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.0891285918951858, "kl": 0.09234219789505005, "learning_rate": 2.0755005250174484e-07, "loss": 0.0214, "num_tokens": 56507372.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9342883825302124, "sampling/importance_sampling_ratio/mean": 1.000509262084961, "sampling/importance_sampling_ratio/min": 0.4982292950153351, "sampling/sampling_logp_difference/max": 0.6966948509216309, "sampling/sampling_logp_difference/mean": 0.01440890971571207, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.24660606682300568, "epoch": 2.1924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.839575214882834, "kl": 0.12259334325790405, "learning_rate": 2.0697253477575088e-07, "loss": -0.0009, "num_tokens": 56534348.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7618639469146729, "sampling/importance_sampling_ratio/mean": 1.0006442070007324, "sampling/importance_sampling_ratio/min": 0.6075704097747803, "sampling/sampling_logp_difference/max": 0.5663723945617676, "sampling/sampling_logp_difference/mean": 0.014180932193994522, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 230.015625, "completions/mean_terminated_length": 230.015625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2670382559299469, "epoch": 2.1936274509803924, "frac_reward_zero_std": 0.5, "grad_norm": 1.2174885407937754, "kl": 0.07931780815124512, "learning_rate": 2.0639561187694733e-07, "loss": -0.0433, "num_tokens": 56564813.0, "reward": 0.1875, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.575170636177063, "sampling/importance_sampling_ratio/mean": 1.0001091957092285, "sampling/importance_sampling_ratio/min": 0.6236374378204346, "sampling/sampling_logp_difference/max": 0.4721860885620117, "sampling/sampling_logp_difference/mean": 0.012448785826563835, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 202.734375, "completions/mean_terminated_length": 202.734375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2719419598579407, "epoch": 2.1948529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 2.4177688377458115, "kl": 0.1150088682770729, "learning_rate": 2.0581928497645164e-07, "loss": -0.0505, "num_tokens": 56596012.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8363903760910034, "sampling/importance_sampling_ratio/mean": 0.999383270740509, "sampling/importance_sampling_ratio/min": 0.47761914134025574, "sampling/sampling_logp_difference/max": 0.7389416694641113, "sampling/sampling_logp_difference/mean": 0.014551948755979538, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 156.859375, "completions/mean_terminated_length": 156.859375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24167387187480927, "epoch": 2.196078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.2773096667814843, "kl": 0.08351308852434158, "learning_rate": 2.0524355524417015e-07, "loss": -0.009, "num_tokens": 56623939.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7385048866271973, "sampling/importance_sampling_ratio/mean": 1.0004111528396606, "sampling/importance_sampling_ratio/min": 0.5637833476066589, "sampling/sampling_logp_difference/max": 0.5730853080749512, "sampling/sampling_logp_difference/mean": 0.01441868208348751, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 195.546875, "completions/mean_terminated_length": 195.546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.30555734038352966, "epoch": 2.1973039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.1106148320527351, "kl": 0.11144275963306427, "learning_rate": 2.0466842384879829e-07, "loss": -0.0079, "num_tokens": 56653462.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5820528268814087, "sampling/importance_sampling_ratio/mean": 1.0003950595855713, "sampling/importance_sampling_ratio/min": 0.5864598155021667, "sampling/sampling_logp_difference/max": 0.5336510539054871, "sampling/sampling_logp_difference/mean": 0.015375595539808273, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.24411731958389282, "epoch": 2.198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.7208697359651974, "kl": 0.09210469573736191, "learning_rate": 2.0409389195781623e-07, "loss": -0.0595, "num_tokens": 56686430.0, "reward": -0.1875, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000260353088379, "sampling/importance_sampling_ratio/min": 0.5911000967025757, "sampling/sampling_logp_difference/max": 0.8386600017547607, "sampling/sampling_logp_difference/mean": 0.013181643560528755, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 164.65625, "completions/mean_terminated_length": 164.65625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.25683221220970154, "epoch": 2.1997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.5931127315147342, "kl": 0.11360817402601242, "learning_rate": 2.0351996073748713e-07, "loss": -0.0039, "num_tokens": 56718152.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000403642654419, "sampling/importance_sampling_ratio/min": 0.5696611404418945, "sampling/sampling_logp_difference/max": 0.7048375606536865, "sampling/sampling_logp_difference/mean": 0.015989817678928375, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 242.453125, "completions/mean_terminated_length": 242.453125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.26049256324768066, "epoch": 2.200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.888650218870308, "kl": 0.08447247743606567, "learning_rate": 2.0294663135285533e-07, "loss": -0.0133, "num_tokens": 56752853.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8217499256134033, "sampling/importance_sampling_ratio/mean": 1.0003026723861694, "sampling/importance_sampling_ratio/min": 0.6241431832313538, "sampling/sampling_logp_difference/max": 0.5997974872589111, "sampling/sampling_logp_difference/mean": 0.013664944097399712, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 182.078125, "completions/mean_terminated_length": 182.078125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2810284495353699, "epoch": 2.202205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.9051843705444376, "kl": 0.12366537749767303, "learning_rate": 2.0237390496774282e-07, "loss": 0.0259, "num_tokens": 56778938.0, "reward": 0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.9871368408203125, "sampling/importance_sampling_ratio/mean": 1.0005348920822144, "sampling/importance_sampling_ratio/min": 0.49550315737724304, "sampling/sampling_logp_difference/max": 0.7021815776824951, "sampling/sampling_logp_difference/mean": 0.016560807824134827, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 184.0625, "completions/mean_terminated_length": 184.0625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.22490039467811584, "epoch": 2.2034313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.056413215260071876, "kl": 0.07760661840438843, "learning_rate": 2.0180178274474834e-07, "loss": 0.0008, "num_tokens": 56811998.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.900390386581421, "sampling/importance_sampling_ratio/mean": 0.999333918094635, "sampling/importance_sampling_ratio/min": 0.5382927656173706, "sampling/sampling_logp_difference/max": 0.642059326171875, "sampling/sampling_logp_difference/mean": 0.013159316033124924, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 158.15625, "completions/mean_terminated_length": 158.15625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.24703152477741241, "epoch": 2.204656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.5164389722404052, "kl": 0.07044568657875061, "learning_rate": 2.012302658452432e-07, "loss": 0.021, "num_tokens": 56837528.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.636584758758545, "sampling/importance_sampling_ratio/mean": 0.9996676445007324, "sampling/importance_sampling_ratio/min": 0.4209079146385193, "sampling/sampling_logp_difference/max": 0.8653411865234375, "sampling/sampling_logp_difference/mean": 0.013793625868856907, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.26092529296875, "epoch": 2.2058823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 2.0319771400909987, "kl": 0.12363815307617188, "learning_rate": 2.0065935542937073e-07, "loss": -0.0257, "num_tokens": 56865660.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.58610200881958, "sampling/importance_sampling_ratio/mean": 1.0004420280456543, "sampling/importance_sampling_ratio/min": 0.5681842565536499, "sampling/sampling_logp_difference/max": 0.5653095245361328, "sampling/sampling_logp_difference/mean": 0.014487011358141899, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 222.96875, "completions/mean_terminated_length": 222.96875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.23370033502578735, "epoch": 2.207107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.05426284143562628, "kl": 0.08036220073699951, "learning_rate": 2.0008905265604315e-07, "loss": 0.0007, "num_tokens": 56899162.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.767967700958252, "sampling/importance_sampling_ratio/mean": 1.0001204013824463, "sampling/importance_sampling_ratio/min": 0.2905968427658081, "sampling/sampling_logp_difference/max": 1.2358183860778809, "sampling/sampling_logp_difference/mean": 0.013781680725514889, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 167.640625, "completions/mean_terminated_length": 167.640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.21038201451301575, "epoch": 2.2083333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.7168966777548178, "kl": 0.08649880439043045, "learning_rate": 1.995193586829387e-07, "loss": -0.0401, "num_tokens": 56923827.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6357125043869019, "sampling/importance_sampling_ratio/mean": 1.0007596015930176, "sampling/importance_sampling_ratio/min": 0.6269845962524414, "sampling/sampling_logp_difference/max": 0.4920785427093506, "sampling/sampling_logp_difference/mean": 0.012823158875107765, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 204.765625, "completions/mean_terminated_length": 204.765625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3014012575149536, "epoch": 2.2095588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.7992363173722363, "kl": 0.1286182552576065, "learning_rate": 1.989502746665001e-07, "loss": -0.0384, "num_tokens": 56951876.0, "reward": -0.28125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4636856317520142, "sampling/importance_sampling_ratio/mean": 0.9995325207710266, "sampling/importance_sampling_ratio/min": 0.6095805764198303, "sampling/sampling_logp_difference/max": 0.49498414993286133, "sampling/sampling_logp_difference/mean": 0.014365926384925842, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 198.453125, "completions/mean_terminated_length": 198.453125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.21758437156677246, "epoch": 2.2107843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.325185412232773, "kl": 0.08413703739643097, "learning_rate": 1.9838180176193176e-07, "loss": 0.0533, "num_tokens": 56992097.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.690238118171692, "sampling/importance_sampling_ratio/mean": 1.0002226829528809, "sampling/importance_sampling_ratio/min": 0.29751691222190857, "sampling/sampling_logp_difference/max": 1.2122842073440552, "sampling/sampling_logp_difference/mean": 0.013783496804535389, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.19069215655326843, "epoch": 2.2120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.5874936688245536, "kl": 0.06194804608821869, "learning_rate": 1.9781394112319787e-07, "loss": -0.1226, "num_tokens": 57017713.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6393591165542603, "sampling/importance_sampling_ratio/mean": 0.999832272529602, "sampling/importance_sampling_ratio/min": 0.6047071814537048, "sampling/sampling_logp_difference/max": 0.5030109882354736, "sampling/sampling_logp_difference/mean": 0.012021776288747787, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 212.65625, "completions/mean_terminated_length": 212.65625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.36637696623802185, "epoch": 2.213235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 2.0582492288032177, "kl": 0.13019004464149475, "learning_rate": 1.9724669390301946e-07, "loss": 0.037, "num_tokens": 57053819.0, "reward": 0.46875, "reward_std": 0.8987700343132019, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5888125896453857, "sampling/importance_sampling_ratio/mean": 0.9998255372047424, "sampling/importance_sampling_ratio/min": 0.5999937057495117, "sampling/sampling_logp_difference/max": 0.510836124420166, "sampling/sampling_logp_difference/mean": 0.01723702810704708, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 165.5625, "completions/mean_terminated_length": 165.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2223309427499771, "epoch": 2.2144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.5234672161510483, "kl": 0.10448633879423141, "learning_rate": 1.9668006125287228e-07, "loss": 0.0074, "num_tokens": 57079439.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.608222246170044, "sampling/importance_sampling_ratio/mean": 0.9996899366378784, "sampling/importance_sampling_ratio/min": 0.5366796851158142, "sampling/sampling_logp_difference/max": 0.6223537921905518, "sampling/sampling_logp_difference/mean": 0.014071298763155937, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 266.59375, "completions/mean_terminated_length": 266.59375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2737465500831604, "epoch": 2.215686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.5541124850224726, "kl": 0.06044057011604309, "learning_rate": 1.96114044322985e-07, "loss": 0.1126, "num_tokens": 57111269.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.695718765258789, "sampling/importance_sampling_ratio/mean": 0.9998077154159546, "sampling/importance_sampling_ratio/min": 0.537212073802948, "sampling/sampling_logp_difference/max": 0.6213623285293579, "sampling/sampling_logp_difference/mean": 0.01348542608320713, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 179.71875, "completions/mean_terminated_length": 179.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.21518371999263763, "epoch": 2.2169117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.3877720786346066, "kl": 0.06830859929323196, "learning_rate": 1.9554864426233604e-07, "loss": 0.0498, "num_tokens": 57136307.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6554591655731201, "sampling/importance_sampling_ratio/mean": 0.999778151512146, "sampling/importance_sampling_ratio/min": 0.26966822147369385, "sampling/sampling_logp_difference/max": 1.3105628490447998, "sampling/sampling_logp_difference/mean": 0.012849608436226845, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 153.09375, "completions/mean_terminated_length": 153.09375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.23742112517356873, "epoch": 2.218137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 2.4757763573259473, "kl": 0.1102885976433754, "learning_rate": 1.9498386221865165e-07, "loss": 0.0117, "num_tokens": 57159529.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.000669240951538, "sampling/importance_sampling_ratio/min": 0.605816125869751, "sampling/sampling_logp_difference/max": 0.5011787414550781, "sampling/sampling_logp_difference/mean": 0.014759579673409462, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 220.546875, "completions/mean_terminated_length": 220.546875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.27240610122680664, "epoch": 2.219362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.06490112940719692, "kl": 0.12128046154975891, "learning_rate": 1.944196993384034e-07, "loss": 0.0011, "num_tokens": 57196924.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6916193962097168, "sampling/importance_sampling_ratio/mean": 1.000105857849121, "sampling/importance_sampling_ratio/min": 0.6148074865341187, "sampling/sampling_logp_difference/max": 0.5256862640380859, "sampling/sampling_logp_difference/mean": 0.017099231481552124, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 162.5625, "completions/mean_terminated_length": 162.5625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.29277801513671875, "epoch": 2.2205882352941178, "frac_reward_zero_std": 0.25, "grad_norm": 2.493362889973553, "kl": 0.13007113337516785, "learning_rate": 1.9385615676680661e-07, "loss": -0.0134, "num_tokens": 57223968.0, "reward": 0.5625, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4952139854431152, "sampling/importance_sampling_ratio/mean": 1.0002390146255493, "sampling/importance_sampling_ratio/min": 0.4914461374282837, "sampling/sampling_logp_difference/max": 0.7104029655456543, "sampling/sampling_logp_difference/mean": 0.015847956761717796, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 208.328125, "completions/mean_terminated_length": 208.328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24020692706108093, "epoch": 2.221813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.07127336127746393, "kl": 0.12844060361385345, "learning_rate": 1.932932356478168e-07, "loss": 0.001, "num_tokens": 57253189.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5585031509399414, "sampling/importance_sampling_ratio/mean": 1.0003728866577148, "sampling/importance_sampling_ratio/min": 0.33361685276031494, "sampling/sampling_logp_difference/max": 1.097762107849121, "sampling/sampling_logp_difference/mean": 0.012762902304530144, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 173.203125, "completions/mean_terminated_length": 173.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2723970413208008, "epoch": 2.2230392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 2.1579952625133267, "kl": 0.1221776232123375, "learning_rate": 1.9273093712412796e-07, "loss": 0.0255, "num_tokens": 57281970.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7064048051834106, "sampling/importance_sampling_ratio/mean": 0.9998112916946411, "sampling/importance_sampling_ratio/min": 0.22986814379692078, "sampling/sampling_logp_difference/max": 1.4702494144439697, "sampling/sampling_logp_difference/mean": 0.0177859365940094, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 185.4375, "completions/mean_terminated_length": 185.4375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23902729153633118, "epoch": 2.224264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.202025891469844, "kl": 0.09321679174900055, "learning_rate": 1.9216926233717084e-07, "loss": 0.0081, "num_tokens": 57313502.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999767541885376, "sampling/importance_sampling_ratio/min": 0.3941425383090973, "sampling/sampling_logp_difference/max": 0.9310426712036133, "sampling/sampling_logp_difference/mean": 0.013515518978238106, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2674904465675354, "epoch": 2.2254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.050382867341168, "kl": 0.09484975039958954, "learning_rate": 1.9160821242710957e-07, "loss": -0.0138, "num_tokens": 57342350.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6207305192947388, "sampling/importance_sampling_ratio/mean": 1.0005056858062744, "sampling/importance_sampling_ratio/min": 0.6129491329193115, "sampling/sampling_logp_difference/max": 0.4894733428955078, "sampling/sampling_logp_difference/mean": 0.013506803661584854, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 227.984375, "completions/mean_terminated_length": 227.984375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.24355986714363098, "epoch": 2.2267156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.2725728688716063, "kl": 0.08003943413496017, "learning_rate": 1.9104778853283987e-07, "loss": 0.0237, "num_tokens": 57374413.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8314207792282104, "sampling/importance_sampling_ratio/mean": 1.0002812147140503, "sampling/importance_sampling_ratio/min": 0.6203051209449768, "sampling/sampling_logp_difference/max": 0.6050920486450195, "sampling/sampling_logp_difference/mean": 0.013915604911744595, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 189.109375, "completions/mean_terminated_length": 189.109375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3851677179336548, "epoch": 2.2279411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 2.4054685081200353, "kl": 0.19514328241348267, "learning_rate": 1.9048799179198655e-07, "loss": -0.0561, "num_tokens": 57400340.0, "reward": -0.25, "reward_std": 0.6972135901451111, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5971453189849854, "sampling/importance_sampling_ratio/mean": 0.9996978640556335, "sampling/importance_sampling_ratio/min": 0.4955294728279114, "sampling/sampling_logp_difference/max": 0.7021284103393555, "sampling/sampling_logp_difference/mean": 0.018159667029976845, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 185.953125, "completions/mean_terminated_length": 185.953125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.1858978122472763, "epoch": 2.2291666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.7616215753387285, "kl": 0.07825511693954468, "learning_rate": 1.8992882334090188e-07, "loss": 0.0884, "num_tokens": 57426929.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000319242477417, "sampling/importance_sampling_ratio/min": 0.12653733789920807, "sampling/sampling_logp_difference/max": 2.0672178268432617, "sampling/sampling_logp_difference/mean": 0.012095373123884201, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 189.46875, "completions/mean_terminated_length": 189.46875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2210862636566162, "epoch": 2.230392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.0660557656364722, "kl": 0.06625708937644958, "learning_rate": 1.893702843146623e-07, "loss": 0.0354, "num_tokens": 57459279.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5971460342407227, "sampling/importance_sampling_ratio/mean": 0.9999723434448242, "sampling/importance_sampling_ratio/min": 0.48653677105903625, "sampling/sampling_logp_difference/max": 0.7204427719116211, "sampling/sampling_logp_difference/mean": 0.012377100065350533, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 183.109375, "completions/mean_terminated_length": 183.109375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2027810513973236, "epoch": 2.2316176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.09053463841587767, "kl": 0.06789502501487732, "learning_rate": 1.8881237584706632e-07, "loss": 0.0006, "num_tokens": 57487414.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.627347469329834, "sampling/importance_sampling_ratio/mean": 0.9995033740997314, "sampling/importance_sampling_ratio/min": 0.4853755533695221, "sampling/sampling_logp_difference/max": 0.7228323221206665, "sampling/sampling_logp_difference/mean": 0.01381603628396988, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 197.90625, "completions/mean_terminated_length": 197.90625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2434486746788025, "epoch": 2.232843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.344361640714432, "kl": 0.07310117781162262, "learning_rate": 1.8825509907063326e-07, "loss": -0.0194, "num_tokens": 57515728.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5562864542007446, "sampling/importance_sampling_ratio/mean": 0.999854564666748, "sampling/importance_sampling_ratio/min": 0.5838513374328613, "sampling/sampling_logp_difference/max": 0.5381088256835938, "sampling/sampling_logp_difference/mean": 0.013278448022902012, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 5000.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 301.078125, "completions/mean_terminated_length": 226.49208068847656, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2564546465873718, "epoch": 2.2340686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.7483155096265234, "kl": 0.08983118087053299, "learning_rate": 1.8769845511659927e-07, "loss": 0.4925, "num_tokens": 57553253.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999797344207764, "sampling/importance_sampling_ratio/min": 0.5528501868247986, "sampling/sampling_logp_difference/max": 0.79681396484375, "sampling/sampling_logp_difference/mean": 0.013430179096758366, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 174.96875, "completions/mean_terminated_length": 174.96875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2382800132036209, "epoch": 2.235294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 2.1293663149804734, "kl": 0.10047850757837296, "learning_rate": 1.871424451149169e-07, "loss": 0.0514, "num_tokens": 57582243.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.62760591506958, "sampling/importance_sampling_ratio/mean": 0.9997063875198364, "sampling/importance_sampling_ratio/min": 0.5972736477851868, "sampling/sampling_logp_difference/max": 0.5153799057006836, "sampling/sampling_logp_difference/mean": 0.014239577576518059, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 235.890625, "completions/mean_terminated_length": 235.890625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.30149415135383606, "epoch": 2.236519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.9845678835254071, "kl": 0.15238609910011292, "learning_rate": 1.865870701942504e-07, "loss": -0.0024, "num_tokens": 57620396.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6185318231582642, "sampling/importance_sampling_ratio/mean": 1.0003349781036377, "sampling/importance_sampling_ratio/min": 0.420204222202301, "sampling/sampling_logp_difference/max": 0.8670144081115723, "sampling/sampling_logp_difference/mean": 0.01578395627439022, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 203.4375, "completions/mean_terminated_length": 203.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2554306387901306, "epoch": 2.2377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.07020997658094677, "kl": 0.11225457489490509, "learning_rate": 1.8603233148197632e-07, "loss": 0.0011, "num_tokens": 57652008.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5684692859649658, "sampling/importance_sampling_ratio/mean": 0.9999103546142578, "sampling/importance_sampling_ratio/min": 0.6260828971862793, "sampling/sampling_logp_difference/max": 0.46827244758605957, "sampling/sampling_logp_difference/mean": 0.01270446926355362, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 213.203125, "completions/mean_terminated_length": 213.203125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2989576458930969, "epoch": 2.238970588235294, "frac_reward_zero_std": 0.25, "grad_norm": 1.816139110366677, "kl": 0.1348409354686737, "learning_rate": 1.8547823010417873e-07, "loss": -0.0742, "num_tokens": 57680149.0, "reward": 0.40625, "reward_std": 0.6205305457115173, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6038249731063843, "sampling/importance_sampling_ratio/mean": 1.0002083778381348, "sampling/importance_sampling_ratio/min": 0.577558159828186, "sampling/sampling_logp_difference/max": 0.5489461421966553, "sampling/sampling_logp_difference/mean": 0.014283552765846252, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 222.828125, "completions/mean_terminated_length": 222.828125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2774503529071808, "epoch": 2.2401960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.0941952484861721, "kl": 0.10034649819135666, "learning_rate": 1.8492476718564866e-07, "loss": 0.0006, "num_tokens": 57713482.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5491840839385986, "sampling/importance_sampling_ratio/mean": 1.0000077486038208, "sampling/importance_sampling_ratio/min": 0.4728690981864929, "sampling/sampling_logp_difference/max": 0.748936653137207, "sampling/sampling_logp_difference/mean": 0.014115766622126102, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 246.484375, "completions/mean_terminated_length": 246.484375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2916295528411865, "epoch": 2.241421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.06366216926669424, "kl": 0.12240947037935257, "learning_rate": 1.8437194384988058e-07, "loss": 0.0011, "num_tokens": 57747209.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999750018119812, "sampling/importance_sampling_ratio/min": 0.32487982511520386, "sampling/sampling_logp_difference/max": 1.1242998838424683, "sampling/sampling_logp_difference/mean": 0.015239045023918152, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 157.109375, "completions/mean_terminated_length": 157.109375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.21737360954284668, "epoch": 2.2426470588235294, "frac_reward_zero_std": 0.25, "grad_norm": 2.5352442426278357, "kl": 0.08680347353219986, "learning_rate": 1.8381976121907067e-07, "loss": 0.0733, "num_tokens": 57772976.0, "reward": 0.46875, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4428476095199585, "sampling/importance_sampling_ratio/mean": 0.9994739294052124, "sampling/importance_sampling_ratio/min": 0.5038317441940308, "sampling/sampling_logp_difference/max": 0.6855130195617676, "sampling/sampling_logp_difference/mean": 0.012364721857011318, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 239.1875, "completions/mean_terminated_length": 239.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.18108172714710236, "epoch": 2.2438725490196076, "frac_reward_zero_std": 0.75, "grad_norm": 1.578981437829289, "kl": 0.07498110085725784, "learning_rate": 1.832682204141152e-07, "loss": 0.0199, "num_tokens": 57806796.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7198060750961304, "sampling/importance_sampling_ratio/mean": 1.0001530647277832, "sampling/importance_sampling_ratio/min": 0.5394806265830994, "sampling/sampling_logp_difference/max": 0.6171483993530273, "sampling/sampling_logp_difference/mean": 0.011187737807631493, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 155.796875, "completions/mean_terminated_length": 155.796875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2116914689540863, "epoch": 2.2450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.06012952864614764, "kl": 0.09823980182409286, "learning_rate": 1.8271732255460643e-07, "loss": 0.001, "num_tokens": 57834287.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8599026203155518, "sampling/importance_sampling_ratio/mean": 0.9994688034057617, "sampling/importance_sampling_ratio/min": 0.6174707412719727, "sampling/sampling_logp_difference/max": 0.6205241680145264, "sampling/sampling_logp_difference/mean": 0.012800629250705242, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 261.015625, "completions/mean_terminated_length": 261.015625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.27193963527679443, "epoch": 2.2463235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.5690034865510771, "kl": 0.05566692352294922, "learning_rate": 1.8216706875883252e-07, "loss": -0.0431, "num_tokens": 57868048.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.595842719078064, "sampling/importance_sampling_ratio/mean": 0.9992786645889282, "sampling/importance_sampling_ratio/min": 0.6169748306274414, "sampling/sampling_logp_difference/max": 0.4829270839691162, "sampling/sampling_logp_difference/mean": 0.014392347075045109, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 193.40625, "completions/mean_terminated_length": 193.40625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.21245065331459045, "epoch": 2.247549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.239601729534302, "kl": 0.06684806942939758, "learning_rate": 1.816174601437736e-07, "loss": -0.0558, "num_tokens": 57899866.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7529635429382324, "sampling/importance_sampling_ratio/mean": 1.0001167058944702, "sampling/importance_sampling_ratio/min": 0.54033362865448, "sampling/sampling_logp_difference/max": 0.6155685782432556, "sampling/sampling_logp_difference/mean": 0.012661349959671497, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 248.296875, "completions/mean_terminated_length": 248.296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3864680528640747, "epoch": 2.248774509803922, "frac_reward_zero_std": 0.5, "grad_norm": 1.7114122116645036, "kl": 0.14726883172988892, "learning_rate": 1.8106849782510058e-07, "loss": -0.0014, "num_tokens": 57933629.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4798548221588135, "sampling/importance_sampling_ratio/mean": 0.9994147419929504, "sampling/importance_sampling_ratio/min": 0.47067582607269287, "sampling/sampling_logp_difference/max": 0.7535857558250427, "sampling/sampling_logp_difference/mean": 0.018671337515115738, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 192.578125, "completions/mean_terminated_length": 192.578125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3123793601989746, "epoch": 2.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.05289987181211492, "kl": 0.09484916180372238, "learning_rate": 1.8052018291717215e-07, "loss": 0.001, "num_tokens": 57969298.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8810420036315918, "sampling/importance_sampling_ratio/mean": 1.0005704164505005, "sampling/importance_sampling_ratio/min": 0.5695523619651794, "sampling/sampling_logp_difference/max": 0.6318259239196777, "sampling/sampling_logp_difference/mean": 0.017064901068806648, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2550053596496582, "epoch": 2.251225490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.03590646972688233, "kl": 0.06980462372303009, "learning_rate": 1.7997251653303247e-07, "loss": 0.0007, "num_tokens": 58004690.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002665519714355, "sampling/importance_sampling_ratio/min": 0.48348575830459595, "sampling/sampling_logp_difference/max": 1.2188178300857544, "sampling/sampling_logp_difference/mean": 0.013812856748700142, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 202.46875, "completions/mean_terminated_length": 202.46875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.23180857300758362, "epoch": 2.252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.04306650140612095, "kl": 0.11366766691207886, "learning_rate": 1.7942549978441012e-07, "loss": 0.0011, "num_tokens": 58036768.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8106759786605835, "sampling/importance_sampling_ratio/mean": 1.0001732110977173, "sampling/importance_sampling_ratio/min": 0.32732874155044556, "sampling/sampling_logp_difference/max": 1.1167902946472168, "sampling/sampling_logp_difference/mean": 0.013028960675001144, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 210.15625, "completions/mean_terminated_length": 210.15625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2635669708251953, "epoch": 2.2536764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.4666512103923888, "kl": 0.09035992622375488, "learning_rate": 1.7887913378171422e-07, "loss": 0.0518, "num_tokens": 58065034.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.7100659608840942, "sampling/importance_sampling_ratio/mean": 1.000524878501892, "sampling/importance_sampling_ratio/min": 0.5668810606002808, "sampling/sampling_logp_difference/max": 0.56760573387146, "sampling/sampling_logp_difference/mean": 0.01479196548461914, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 236.8125, "completions/mean_terminated_length": 236.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.22833718359470367, "epoch": 2.2549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.2875345140570835, "kl": 0.07793845236301422, "learning_rate": 1.783334196340331e-07, "loss": -0.0267, "num_tokens": 58098766.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996142983436584, "sampling/importance_sampling_ratio/min": 0.5325475335121155, "sampling/sampling_logp_difference/max": 0.8491120338439941, "sampling/sampling_logp_difference/mean": 0.012790380977094173, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 158.609375, "completions/mean_terminated_length": 158.609375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.19125020503997803, "epoch": 2.256127450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.06275535771901054, "kl": 0.07269306480884552, "learning_rate": 1.777883584491317e-07, "loss": 0.0007, "num_tokens": 58122853.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.574523687362671, "sampling/importance_sampling_ratio/mean": 0.9985629320144653, "sampling/importance_sampling_ratio/min": 0.3798008859157562, "sampling/sampling_logp_difference/max": 0.9681081771850586, "sampling/sampling_logp_difference/mean": 0.012079785577952862, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.21966275572776794, "epoch": 2.2573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.2274985334583557, "kl": 0.122444286942482, "learning_rate": 1.7724395133345022e-07, "loss": -0.0055, "num_tokens": 58154525.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 1.0001604557037354, "sampling/importance_sampling_ratio/min": 0.5928127765655518, "sampling/sampling_logp_difference/max": 0.5228766202926636, "sampling/sampling_logp_difference/mean": 0.012999025173485279, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 145.28125, "completions/mean_terminated_length": 145.28125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2318025827407837, "epoch": 2.258578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.057211100450288946, "kl": 0.13506409525871277, "learning_rate": 1.7670019939210023e-07, "loss": 0.0013, "num_tokens": 58179919.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9647648334503174, "sampling/importance_sampling_ratio/mean": 1.0004273653030396, "sampling/importance_sampling_ratio/min": 0.5388311743736267, "sampling/sampling_logp_difference/max": 0.6753726005554199, "sampling/sampling_logp_difference/mean": 0.013093828223645687, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.22805750370025635, "epoch": 2.2598039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.7208807686495529, "kl": 0.1852080225944519, "learning_rate": 1.761571037288637e-07, "loss": -0.0331, "num_tokens": 58206191.0, "reward": -0.21875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000075101852417, "sampling/importance_sampling_ratio/min": 0.18973375856876373, "sampling/sampling_logp_difference/max": 1.6621334552764893, "sampling/sampling_logp_difference/mean": 0.014221054501831532, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 223.0625, "completions/mean_terminated_length": 223.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.318605899810791, "epoch": 2.261029411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.9910644140403233, "kl": 0.12354724109172821, "learning_rate": 1.7561466544619076e-07, "loss": -0.0009, "num_tokens": 58244115.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8127728700637817, "sampling/importance_sampling_ratio/mean": 1.0001894235610962, "sampling/importance_sampling_ratio/min": 0.29515746235847473, "sampling/sampling_logp_difference/max": 1.2202463150024414, "sampling/sampling_logp_difference/mean": 0.01690426468849182, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 161.515625, "completions/mean_terminated_length": 161.515625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.16774044930934906, "epoch": 2.2622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.1828802332373084, "kl": 0.06102241948246956, "learning_rate": 1.7507288564519646e-07, "loss": 0.0006, "num_tokens": 58268484.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7375820875167847, "sampling/importance_sampling_ratio/mean": 0.9996050596237183, "sampling/importance_sampling_ratio/min": 0.509118914604187, "sampling/sampling_logp_difference/max": 0.6750736236572266, "sampling/sampling_logp_difference/mean": 0.01106603629887104, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 244.546875, "completions/mean_terminated_length": 244.546875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.305085688829422, "epoch": 2.263480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.0515456690053895, "kl": 0.10104508697986603, "learning_rate": 1.7453176542565956e-07, "loss": 0.0106, "num_tokens": 58308855.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7150546312332153, "sampling/importance_sampling_ratio/mean": 1.000779151916504, "sampling/importance_sampling_ratio/min": 0.6202815771102905, "sampling/sampling_logp_difference/max": 0.5394449234008789, "sampling/sampling_logp_difference/mean": 0.014966677874326706, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 226.921875, "completions/mean_terminated_length": 226.921875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.27462872862815857, "epoch": 2.264705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 1.323226474562783, "kl": 0.0892459824681282, "learning_rate": 1.7399130588601968e-07, "loss": -0.0757, "num_tokens": 58348738.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999624490737915, "sampling/importance_sampling_ratio/min": 0.43417733907699585, "sampling/sampling_logp_difference/max": 0.8811070919036865, "sampling/sampling_logp_difference/mean": 0.015549260191619396, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 243.796875, "completions/mean_terminated_length": 243.796875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.21817326545715332, "epoch": 2.2659313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.179961487129024, "kl": 0.07068933546543121, "learning_rate": 1.7345150812337562e-07, "loss": 0.0101, "num_tokens": 58382693.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4933048486709595, "sampling/importance_sampling_ratio/mean": 1.0000038146972656, "sampling/importance_sampling_ratio/min": 0.6117191910743713, "sampling/sampling_logp_difference/max": 0.4914819598197937, "sampling/sampling_logp_difference/mean": 0.012668643146753311, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 333.515625, "completions/mean_terminated_length": 333.515625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3151954710483551, "epoch": 2.267156862745098, "frac_reward_zero_std": 0.0, "grad_norm": 1.5045878073237393, "kl": 0.12498971819877625, "learning_rate": 1.7291237323348284e-07, "loss": 0.0304, "num_tokens": 58420150.0, "reward": 0.3125, "reward_std": 0.843070387840271, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.709483027458191, "sampling/importance_sampling_ratio/mean": 1.000072717666626, "sampling/importance_sampling_ratio/min": 0.2756160795688629, "sampling/sampling_logp_difference/max": 1.2887464761734009, "sampling/sampling_logp_difference/mean": 0.014403359033167362, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 202.65625, "completions/mean_terminated_length": 202.65625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.22938711941242218, "epoch": 2.2683823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.12281090556332791, "kl": 0.06494831293821335, "learning_rate": 1.7237390231075055e-07, "loss": 0.0006, "num_tokens": 58456352.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004684925079346, "sampling/importance_sampling_ratio/min": 0.5263667106628418, "sampling/sampling_logp_difference/max": 0.751194953918457, "sampling/sampling_logp_difference/mean": 0.015168210491538048, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 204.734375, "completions/mean_terminated_length": 204.734375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.26925528049468994, "epoch": 2.269607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.491373261845078, "kl": 0.08605523407459259, "learning_rate": 1.7183609644824092e-07, "loss": 0.0395, "num_tokens": 58488335.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.5362949371337891, "sampling/sampling_logp_difference/max": 0.8254508972167969, "sampling/sampling_logp_difference/mean": 0.015716159716248512, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 259.296875, "completions/mean_terminated_length": 259.296875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3036905825138092, "epoch": 2.2708333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.086729544282273, "kl": 0.14409559965133667, "learning_rate": 1.7129895673766575e-07, "loss": -0.0287, "num_tokens": 58521794.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 0.9997403621673584, "sampling/importance_sampling_ratio/min": 0.5336452722549438, "sampling/sampling_logp_difference/max": 0.6280239820480347, "sampling/sampling_logp_difference/mean": 0.014530260115861893, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 219.9375, "completions/mean_terminated_length": 219.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.20145106315612793, "epoch": 2.2720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.04183860565817975, "kl": 0.06628386676311493, "learning_rate": 1.707624842693844e-07, "loss": 0.0006, "num_tokens": 58561774.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8742541074752808, "sampling/importance_sampling_ratio/mean": 0.9997987151145935, "sampling/importance_sampling_ratio/min": 0.47087979316711426, "sampling/sampling_logp_difference/max": 0.7531524300575256, "sampling/sampling_logp_difference/mean": 0.012259380891919136, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.26910310983657837, "epoch": 2.2732843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.1433446537160958, "kl": 0.12811408936977386, "learning_rate": 1.7022668013240227e-07, "loss": 0.053, "num_tokens": 58593390.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4334418773651123, "sampling/importance_sampling_ratio/mean": 0.9996029138565063, "sampling/importance_sampling_ratio/min": 0.6056228876113892, "sampling/sampling_logp_difference/max": 0.501497745513916, "sampling/sampling_logp_difference/mean": 0.013298267498612404, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 171.453125, "completions/mean_terminated_length": 171.453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2976323068141937, "epoch": 2.2745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.046461394156045846, "kl": 0.08296193182468414, "learning_rate": 1.696915454143676e-07, "loss": 0.0009, "num_tokens": 58620891.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9882467985153198, "sampling/importance_sampling_ratio/mean": 1.000699520111084, "sampling/importance_sampling_ratio/min": 0.6074444055557251, "sampling/sampling_logp_difference/max": 0.6872532367706299, "sampling/sampling_logp_difference/mean": 0.015259575098752975, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 216.765625, "completions/mean_terminated_length": 216.765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2673759460449219, "epoch": 2.275735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.1585872761982534, "kl": 0.09247661381959915, "learning_rate": 1.691570812015704e-07, "loss": -0.0115, "num_tokens": 58653580.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6603155136108398, "sampling/importance_sampling_ratio/mean": 1.0005006790161133, "sampling/importance_sampling_ratio/min": 0.6128051280975342, "sampling/sampling_logp_difference/max": 0.5070075988769531, "sampling/sampling_logp_difference/mean": 0.014402111992239952, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.23475946485996246, "epoch": 2.2769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.2278866171460356, "kl": 0.07313594222068787, "learning_rate": 1.6862328857893855e-07, "loss": -0.0231, "num_tokens": 58681596.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7771425247192383, "sampling/importance_sampling_ratio/mean": 1.0007569789886475, "sampling/importance_sampling_ratio/min": 0.5250598788261414, "sampling/sampling_logp_difference/max": 0.6442430019378662, "sampling/sampling_logp_difference/mean": 0.014543603174388409, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 232.15625, "completions/mean_terminated_length": 232.15625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.30302685499191284, "epoch": 2.278186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.2608571469594911, "kl": 0.10338533669710159, "learning_rate": 1.680901686300376e-07, "loss": -0.0276, "num_tokens": 58718358.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5953242778778076, "sampling/importance_sampling_ratio/mean": 0.9996898770332336, "sampling/importance_sampling_ratio/min": 0.512139618396759, "sampling/sampling_logp_difference/max": 0.6691579818725586, "sampling/sampling_logp_difference/mean": 0.016415957361459732, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.29900407791137695, "epoch": 2.2794117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.7167735694978379, "kl": 0.0868486762046814, "learning_rate": 1.6755772243706712e-07, "loss": 0.0286, "num_tokens": 58749542.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8058955669403076, "sampling/importance_sampling_ratio/mean": 1.0002375841140747, "sampling/importance_sampling_ratio/min": 0.48686483502388, "sampling/sampling_logp_difference/max": 0.719768762588501, "sampling/sampling_logp_difference/mean": 0.015454989857971668, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 212.6875, "completions/mean_terminated_length": 212.6875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2853885889053345, "epoch": 2.280637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 2.1323403182520444, "kl": 0.11518781632184982, "learning_rate": 1.6702595108085942e-07, "loss": 0.0141, "num_tokens": 58784290.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004379749298096, "sampling/importance_sampling_ratio/min": 0.42395901679992676, "sampling/sampling_logp_difference/max": 4.34964656829834, "sampling/sampling_logp_difference/mean": 0.01463394146412611, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 228.96875, "completions/mean_terminated_length": 228.96875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.31048333644866943, "epoch": 2.281862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.115790642059336, "kl": 0.08031786978244781, "learning_rate": 1.6649485564087644e-07, "loss": 0.0033, "num_tokens": 58819760.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6257354021072388, "sampling/importance_sampling_ratio/mean": 0.999483048915863, "sampling/importance_sampling_ratio/min": 0.4408201575279236, "sampling/sampling_logp_difference/max": 0.8191182613372803, "sampling/sampling_logp_difference/mean": 0.016958583146333694, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 208.4375, "completions/mean_terminated_length": 208.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2439727485179901, "epoch": 2.2830882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.5425184646331114, "kl": 0.08408606052398682, "learning_rate": 1.6596443719520826e-07, "loss": -0.0401, "num_tokens": 58849420.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6181564331054688, "sampling/importance_sampling_ratio/mean": 0.9995671510696411, "sampling/importance_sampling_ratio/min": 0.6246147751808167, "sampling/sampling_logp_difference/max": 0.48128747940063477, "sampling/sampling_logp_difference/mean": 0.014026173390448093, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 187.640625, "completions/mean_terminated_length": 187.640625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.19454039633274078, "epoch": 2.284313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0376193048901173, "kl": 0.06733225286006927, "learning_rate": 1.6543469682057104e-07, "loss": 0.0007, "num_tokens": 58878197.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5423756837844849, "sampling/importance_sampling_ratio/mean": 0.9997547268867493, "sampling/importance_sampling_ratio/min": 0.6241891980171204, "sampling/sampling_logp_difference/max": 0.471301794052124, "sampling/sampling_logp_difference/mean": 0.012020319700241089, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 206.15625, "completions/mean_terminated_length": 206.15625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.21993784606456757, "epoch": 2.2855392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.2506955861577824, "kl": 0.08632819354534149, "learning_rate": 1.6490563559230357e-07, "loss": 0.0059, "num_tokens": 58906255.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5340826511383057, "sampling/importance_sampling_ratio/mean": 0.9997017979621887, "sampling/importance_sampling_ratio/min": 0.5940049886703491, "sampling/sampling_logp_difference/max": 0.5208675861358643, "sampling/sampling_logp_difference/mean": 0.012924928218126297, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 219.390625, "completions/mean_terminated_length": 219.390625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.323246031999588, "epoch": 2.286764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.4698833468616563, "kl": 0.11695494502782822, "learning_rate": 1.6437725458436725e-07, "loss": -0.0174, "num_tokens": 58937480.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6660001277923584, "sampling/importance_sampling_ratio/mean": 0.9996105432510376, "sampling/importance_sampling_ratio/min": 0.5110235810279846, "sampling/sampling_logp_difference/max": 0.6713396310806274, "sampling/sampling_logp_difference/mean": 0.01686575450003147, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 191.171875, "completions/mean_terminated_length": 191.171875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.13237124681472778, "epoch": 2.2879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.026347006718303, "kl": 0.06299218535423279, "learning_rate": 1.6384955486934154e-07, "loss": 0.0087, "num_tokens": 58964979.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6238988637924194, "sampling/importance_sampling_ratio/mean": 1.000060796737671, "sampling/importance_sampling_ratio/min": 0.6016948223114014, "sampling/sampling_logp_difference/max": 0.508004903793335, "sampling/sampling_logp_difference/mean": 0.008998863399028778, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 202.359375, "completions/mean_terminated_length": 202.359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.198871448636055, "epoch": 2.2892156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.06890909192266657, "kl": 0.06596966087818146, "learning_rate": 1.633225375184239e-07, "loss": 0.0007, "num_tokens": 58995066.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.949591040611267, "sampling/importance_sampling_ratio/mean": 1.000196099281311, "sampling/importance_sampling_ratio/min": 0.4717702269554138, "sampling/sampling_logp_difference/max": 0.7512632608413696, "sampling/sampling_logp_difference/mean": 0.013516712002456188, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 185.09375, "completions/mean_terminated_length": 185.09375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.22761297225952148, "epoch": 2.2904411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.04666079638369761, "kl": 0.06524749845266342, "learning_rate": 1.6279620360142594e-07, "loss": 0.0007, "num_tokens": 59020272.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6944836378097534, "sampling/importance_sampling_ratio/mean": 1.0000618696212769, "sampling/importance_sampling_ratio/min": 0.42703554034233093, "sampling/sampling_logp_difference/max": 0.8508880138397217, "sampling/sampling_logp_difference/mean": 0.014888203702867031, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 166.828125, "completions/mean_terminated_length": 166.828125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.25784143805503845, "epoch": 2.2916666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.4682644214754028, "kl": 0.09768915921449661, "learning_rate": 1.62270554186772e-07, "loss": 0.042, "num_tokens": 59045173.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5245864391326904, "sampling/importance_sampling_ratio/mean": 0.9994574785232544, "sampling/importance_sampling_ratio/min": 0.5124073028564453, "sampling/sampling_logp_difference/max": 0.668635368347168, "sampling/sampling_logp_difference/mean": 0.014561614021658897, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 218.203125, "completions/mean_terminated_length": 218.203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2707987427711487, "epoch": 2.292892156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.0328097232178153, "kl": 0.09174228459596634, "learning_rate": 1.6174559034149737e-07, "loss": -0.0152, "num_tokens": 59077506.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4908168315887451, "sampling/importance_sampling_ratio/mean": 0.9995948076248169, "sampling/importance_sampling_ratio/min": 0.5163322687149048, "sampling/sampling_logp_difference/max": 0.6610047817230225, "sampling/sampling_logp_difference/mean": 0.015276968479156494, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 219.9375, "completions/mean_terminated_length": 219.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2424817532300949, "epoch": 2.2941176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.9338241687311134, "kl": 0.07496918737888336, "learning_rate": 1.6122131313124538e-07, "loss": -0.0297, "num_tokens": 59110350.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000536441802979, "sampling/importance_sampling_ratio/min": 0.30553194880485535, "sampling/sampling_logp_difference/max": 1.1857008934020996, "sampling/sampling_logp_difference/mean": 0.014451341703534126, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 228.171875, "completions/mean_terminated_length": 228.171875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.31136059761047363, "epoch": 2.295343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.016010546987199, "kl": 0.11611990630626678, "learning_rate": 1.606977236202654e-07, "loss": -0.0233, "num_tokens": 59140937.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6187459230422974, "sampling/importance_sampling_ratio/mean": 0.9999882578849792, "sampling/importance_sampling_ratio/min": 0.5052246451377869, "sampling/sampling_logp_difference/max": 0.6827521324157715, "sampling/sampling_logp_difference/mean": 0.015141132287681103, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 179.921875, "completions/mean_terminated_length": 179.921875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2291935831308365, "epoch": 2.2965686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.435881528477487, "kl": 0.10278773307800293, "learning_rate": 1.6017482287141088e-07, "loss": 0.0051, "num_tokens": 59167844.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.9217751026153564, "sampling/importance_sampling_ratio/mean": 0.9994111061096191, "sampling/importance_sampling_ratio/min": 0.4882408678531647, "sampling/sampling_logp_difference/max": 0.7169463634490967, "sampling/sampling_logp_difference/mean": 0.013567205518484116, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 229.34375, "completions/mean_terminated_length": 229.34375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.31835418939590454, "epoch": 2.297794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.8047133088676053, "kl": 0.12889903783798218, "learning_rate": 1.5965261194613755e-07, "loss": 0.0596, "num_tokens": 59197994.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.9140546321868896, "sampling/importance_sampling_ratio/mean": 1.0005314350128174, "sampling/importance_sampling_ratio/min": 0.5223632454872131, "sampling/sampling_logp_difference/max": 0.6493921279907227, "sampling/sampling_logp_difference/mean": 0.016464825719594955, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 225.65625, "completions/mean_terminated_length": 225.65625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.35226359963417053, "epoch": 2.299019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.7955400617526363, "kl": 0.13876409828662872, "learning_rate": 1.591310919045003e-07, "loss": -0.0136, "num_tokens": 59228084.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6346104145050049, "sampling/importance_sampling_ratio/mean": 1.0000537633895874, "sampling/importance_sampling_ratio/min": 0.475128173828125, "sampling/sampling_logp_difference/max": 0.7441706657409668, "sampling/sampling_logp_difference/mean": 0.019753381609916687, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 252.6875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.27226123213768005, "epoch": 2.3002450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.3958501420863585, "kl": 0.06177687272429466, "learning_rate": 1.5861026380515163e-07, "loss": -0.0232, "num_tokens": 59259760.0, "reward": -0.5625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": -0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.771236538887024, "sampling/importance_sampling_ratio/mean": 1.0004510879516602, "sampling/importance_sampling_ratio/min": 0.48663097620010376, "sampling/sampling_logp_difference/max": 0.7202491760253906, "sampling/sampling_logp_difference/mean": 0.014677229337394238, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 295.453125, "completions/mean_terminated_length": 295.453125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.283691942691803, "epoch": 2.301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9297130937338883, "kl": 0.07495174556970596, "learning_rate": 1.5809012870533995e-07, "loss": -0.0029, "num_tokens": 59296525.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7762110233306885, "sampling/importance_sampling_ratio/mean": 0.9998952150344849, "sampling/importance_sampling_ratio/min": 0.48336297273635864, "sampling/sampling_logp_difference/max": 0.7269874811172485, "sampling/sampling_logp_difference/mean": 0.014811035245656967, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 223.765625, "completions/mean_terminated_length": 223.765625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.23727725446224213, "epoch": 2.3026960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.8755819010904231, "kl": 0.06590835005044937, "learning_rate": 1.575706876609063e-07, "loss": 0.0146, "num_tokens": 59332974.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6210137605667114, "sampling/importance_sampling_ratio/mean": 1.0001112222671509, "sampling/importance_sampling_ratio/min": 0.2855037748813629, "sampling/sampling_logp_difference/max": 1.2535001039505005, "sampling/sampling_logp_difference/mean": 0.013454330153763294, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 263.46875, "completions/mean_terminated_length": 263.46875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2924033999443054, "epoch": 2.303921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.6013525803897344, "kl": 0.09922477602958679, "learning_rate": 1.5705194172628323e-07, "loss": 0.0914, "num_tokens": 59372604.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 0.9998865127563477, "sampling/importance_sampling_ratio/min": 0.624093770980835, "sampling/sampling_logp_difference/max": 0.4985384941101074, "sampling/sampling_logp_difference/mean": 0.01429499126970768, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26727503538131714, "epoch": 2.3051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.1899354246291431, "kl": 0.11983922123908997, "learning_rate": 1.565338919544918e-07, "loss": 0.0031, "num_tokens": 59408364.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5612788200378418, "sampling/importance_sampling_ratio/mean": 0.9999018907546997, "sampling/importance_sampling_ratio/min": 0.5402755737304688, "sampling/sampling_logp_difference/max": 0.6156759262084961, "sampling/sampling_logp_difference/mean": 0.015409836545586586, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3307681381702423, "epoch": 2.306372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.9291671873146752, "kl": 0.13036750257015228, "learning_rate": 1.5601653939714072e-07, "loss": -0.0016, "num_tokens": 59449068.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002212524414062, "sampling/importance_sampling_ratio/min": 0.3798368573188782, "sampling/sampling_logp_difference/max": 0.9680135250091553, "sampling/sampling_logp_difference/mean": 0.016231603920459747, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 179.6875, "completions/mean_terminated_length": 179.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2324448525905609, "epoch": 2.3075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.04347912088849565, "kl": 0.06620276719331741, "learning_rate": 1.5549988510442258e-07, "loss": 0.0007, "num_tokens": 59479784.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.826905369758606, "sampling/importance_sampling_ratio/mean": 1.0006721019744873, "sampling/importance_sampling_ratio/min": 0.4835604727268219, "sampling/sampling_logp_difference/max": 0.726578950881958, "sampling/sampling_logp_difference/mean": 0.013993888162076473, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 216.109375, "completions/mean_terminated_length": 216.109375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.25666937232017517, "epoch": 2.3088235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.3402557592021203, "kl": 0.0841069370508194, "learning_rate": 1.5498393012511285e-07, "loss": -0.0154, "num_tokens": 59510287.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995979070663452, "sampling/importance_sampling_ratio/min": 0.41927775740623474, "sampling/sampling_logp_difference/max": 0.8762650489807129, "sampling/sampling_logp_difference/mean": 0.01526167057454586, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 190.859375, "completions/mean_terminated_length": 190.859375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2237476408481598, "epoch": 2.310049019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.5917239669146777, "kl": 0.12006629258394241, "learning_rate": 1.5446867550656767e-07, "loss": 0.0063, "num_tokens": 59536566.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.597978949546814, "sampling/importance_sampling_ratio/mean": 1.000736951828003, "sampling/importance_sampling_ratio/min": 0.5315966010093689, "sampling/sampling_logp_difference/max": 0.6318703889846802, "sampling/sampling_logp_difference/mean": 0.013502996414899826, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 190.4375, "completions/mean_terminated_length": 190.4375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.23544517159461975, "epoch": 2.311274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 1.3060829679562909, "kl": 0.08534479141235352, "learning_rate": 1.5395412229472103e-07, "loss": -0.0123, "num_tokens": 59572738.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5971856117248535, "sampling/importance_sampling_ratio/mean": 0.9998691082000732, "sampling/importance_sampling_ratio/min": 0.2696689963340759, "sampling/sampling_logp_difference/max": 1.3105599880218506, "sampling/sampling_logp_difference/mean": 0.014193758368492126, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 170.8125, "completions/mean_terminated_length": 170.8125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.19557109475135803, "epoch": 2.3125, "frac_reward_zero_std": 0.75, "grad_norm": 2.403418451183364, "kl": 0.07334604859352112, "learning_rate": 1.5344027153408374e-07, "loss": -0.0423, "num_tokens": 59611174.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7779353857040405, "sampling/importance_sampling_ratio/mean": 1.0002080202102661, "sampling/importance_sampling_ratio/min": 0.45178794860839844, "sampling/sampling_logp_difference/max": 0.7945423126220703, "sampling/sampling_logp_difference/mean": 0.013376087881624699, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 201.3125, "completions/mean_terminated_length": 201.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.21097078919410706, "epoch": 2.313725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.2451653351307033, "kl": 0.08662876486778259, "learning_rate": 1.5292712426773973e-07, "loss": 0.0033, "num_tokens": 59639706.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997841715812683, "sampling/importance_sampling_ratio/min": 0.3311176896095276, "sampling/sampling_logp_difference/max": 1.2261834144592285, "sampling/sampling_logp_difference/mean": 0.01257825456559658, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 203.359375, "completions/mean_terminated_length": 203.359375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.17383569478988647, "epoch": 2.314950980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03679861989617035, "kl": 0.054777417331933975, "learning_rate": 1.5241468153734594e-07, "loss": 0.0005, "num_tokens": 59678385.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5678654909133911, "sampling/importance_sampling_ratio/mean": 0.9997862577438354, "sampling/importance_sampling_ratio/min": 0.6043789982795715, "sampling/sampling_logp_difference/max": 0.5035538673400879, "sampling/sampling_logp_difference/mean": 0.0109081557020545, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 159.453125, "completions/mean_terminated_length": 159.453125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2851375639438629, "epoch": 2.3161764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.09114244957311834, "kl": 0.17656400799751282, "learning_rate": 1.5190294438312834e-07, "loss": 0.0018, "num_tokens": 59704062.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007810831069946, "sampling/importance_sampling_ratio/mean": 0.9995872378349304, "sampling/importance_sampling_ratio/min": 0.29912951588630676, "sampling/sampling_logp_difference/max": 1.206878662109375, "sampling/sampling_logp_difference/mean": 0.015117294155061245, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 147.703125, "completions/mean_terminated_length": 147.703125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.24446986615657806, "epoch": 2.3174019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 2.232726297540076, "kl": 0.16684526205062866, "learning_rate": 1.5139191384388094e-07, "loss": -0.0017, "num_tokens": 59728075.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004994869232178, "sampling/importance_sampling_ratio/min": 0.6260403990745544, "sampling/sampling_logp_difference/max": 0.7444643974304199, "sampling/sampling_logp_difference/mean": 0.014580121263861656, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 237.421875, "completions/mean_terminated_length": 237.421875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.15843701362609863, "epoch": 2.318627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.02920837417083462, "kl": 0.039530105888843536, "learning_rate": 1.5088159095696362e-07, "loss": 0.0004, "num_tokens": 59758310.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6272822618484497, "sampling/importance_sampling_ratio/mean": 1.0002012252807617, "sampling/importance_sampling_ratio/min": 0.6125374436378479, "sampling/sampling_logp_difference/max": 0.490145206451416, "sampling/sampling_logp_difference/mean": 0.009524786844849586, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 215.734375, "completions/mean_terminated_length": 215.734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1598864495754242, "epoch": 2.3198529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.331654909496703, "kl": 0.06764960289001465, "learning_rate": 1.5037197675829916e-07, "loss": 0.0389, "num_tokens": 59793061.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7586230039596558, "sampling/importance_sampling_ratio/mean": 1.0005803108215332, "sampling/importance_sampling_ratio/min": 0.6109176874160767, "sampling/sampling_logp_difference/max": 0.5645310878753662, "sampling/sampling_logp_difference/mean": 0.009833090007305145, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24242526292800903, "epoch": 2.321078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.039675668582658, "kl": 0.07931549847126007, "learning_rate": 1.4986307228237267e-07, "loss": -0.0253, "num_tokens": 59830637.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6120399236679077, "sampling/importance_sampling_ratio/mean": 1.0000349283218384, "sampling/importance_sampling_ratio/min": 0.5739162564277649, "sampling/sampling_logp_difference/max": 0.5552718639373779, "sampling/sampling_logp_difference/mean": 0.013587141409516335, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 194.09375, "completions/mean_terminated_length": 194.09375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.28710392117500305, "epoch": 2.3223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.3389509409273923, "kl": 0.09452906250953674, "learning_rate": 1.4935487856222723e-07, "loss": -0.0157, "num_tokens": 59863475.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6049695014953613, "sampling/importance_sampling_ratio/mean": 0.99993896484375, "sampling/importance_sampling_ratio/min": 0.30557137727737427, "sampling/sampling_logp_difference/max": 1.1855719089508057, "sampling/sampling_logp_difference/mean": 0.015732331201434135, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 216.421875, "completions/mean_terminated_length": 216.421875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2689727544784546, "epoch": 2.323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.3257165466554364, "kl": 0.08077120780944824, "learning_rate": 1.4884739662946445e-07, "loss": 0.0103, "num_tokens": 59898302.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999836266040802, "sampling/importance_sampling_ratio/min": 0.20456689596176147, "sampling/sampling_logp_difference/max": 1.586860179901123, "sampling/sampling_logp_difference/mean": 0.016044773161411285, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 197.765625, "completions/mean_terminated_length": 197.765625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.24810859560966492, "epoch": 2.3247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.6701793822172892, "kl": 0.09056637436151505, "learning_rate": 1.4834062751424015e-07, "loss": 0.0104, "num_tokens": 59931279.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4412437677383423, "sampling/importance_sampling_ratio/mean": 0.9997663497924805, "sampling/importance_sampling_ratio/min": 0.3919161856174469, "sampling/sampling_logp_difference/max": 0.9367072582244873, "sampling/sampling_logp_difference/mean": 0.01493392325937748, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 228.4375, "completions/mean_terminated_length": 228.4375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.22810769081115723, "epoch": 2.325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.026946671836266173, "kl": 0.04421919584274292, "learning_rate": 1.478345722452639e-07, "loss": 0.0004, "num_tokens": 59962059.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6555689573287964, "sampling/importance_sampling_ratio/mean": 1.0000898838043213, "sampling/importance_sampling_ratio/min": 0.3941422700881958, "sampling/sampling_logp_difference/max": 0.9310433864593506, "sampling/sampling_logp_difference/mean": 0.013491123914718628, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.25881195068359375, "epoch": 2.327205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.7893686702587364, "kl": 0.08352837711572647, "learning_rate": 1.4732923184979562e-07, "loss": 0.0252, "num_tokens": 59991067.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4388809204101562, "sampling/importance_sampling_ratio/mean": 1.0000529289245605, "sampling/importance_sampling_ratio/min": 0.5265589356422424, "sampling/sampling_logp_difference/max": 0.6413919925689697, "sampling/sampling_logp_difference/mean": 0.013214487582445145, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2558945119380951, "epoch": 2.3284313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.4142773353536864, "kl": 0.08801299333572388, "learning_rate": 1.4682460735364422e-07, "loss": 0.0054, "num_tokens": 60019115.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5824116468429565, "sampling/importance_sampling_ratio/mean": 0.9999874234199524, "sampling/importance_sampling_ratio/min": 0.5375378131866455, "sampling/sampling_logp_difference/max": 0.6207561492919922, "sampling/sampling_logp_difference/mean": 0.014198105782270432, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 159.84375, "completions/mean_terminated_length": 159.84375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.18878209590911865, "epoch": 2.329656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.052108689172164484, "kl": 0.05853067338466644, "learning_rate": 1.4632069978116584e-07, "loss": 0.0005, "num_tokens": 60046369.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007681846618652, "sampling/importance_sampling_ratio/min": 0.5236412882804871, "sampling/sampling_logp_difference/max": 0.9863035678863525, "sampling/sampling_logp_difference/mean": 0.012843549251556396, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 204.3125, "completions/mean_terminated_length": 204.3125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.24379225075244904, "epoch": 2.3308823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.10996350721454363, "kl": 0.09064918756484985, "learning_rate": 1.4581751015526033e-07, "loss": 0.001, "num_tokens": 60076453.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995670318603516, "sampling/importance_sampling_ratio/min": 0.2191912680864334, "sampling/sampling_logp_difference/max": 1.517810583114624, "sampling/sampling_logp_difference/mean": 0.014508318156003952, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.23957127332687378, "epoch": 2.332107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.5767839733834388, "kl": 0.12310803681612015, "learning_rate": 1.4531503949737106e-07, "loss": 0.0196, "num_tokens": 60108045.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5311764478683472, "sampling/importance_sampling_ratio/mean": 1.000032901763916, "sampling/importance_sampling_ratio/min": 0.13440155982971191, "sampling/sampling_logp_difference/max": 2.006923198699951, "sampling/sampling_logp_difference/mean": 0.013991523534059525, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 224.21875, "completions/mean_terminated_length": 224.21875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.21522961556911469, "epoch": 2.3333333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.6215186486844329, "kl": 0.10068541765213013, "learning_rate": 1.4481328882748184e-07, "loss": 0.0134, "num_tokens": 60137611.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.757907748222351, "sampling/importance_sampling_ratio/mean": 1.0003650188446045, "sampling/importance_sampling_ratio/min": 0.4751411974430084, "sampling/sampling_logp_difference/max": 0.7441432476043701, "sampling/sampling_logp_difference/mean": 0.012857379391789436, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 244.828125, "completions/mean_terminated_length": 244.828125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2058878242969513, "epoch": 2.3345588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.9255789119409037, "kl": 0.057641707360744476, "learning_rate": 1.4431225916411455e-07, "loss": -0.026, "num_tokens": 60168128.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.9002087116241455, "sampling/importance_sampling_ratio/mean": 0.9999744296073914, "sampling/importance_sampling_ratio/min": 0.5323302149772644, "sampling/sampling_logp_difference/max": 0.6419637203216553, "sampling/sampling_logp_difference/mean": 0.012306640855967999, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.734375, "completions/mean_terminated_length": 169.734375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.253532350063324, "epoch": 2.3357843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.13391592878802164, "kl": 0.11147284507751465, "learning_rate": 1.4381195152432769e-07, "loss": 0.0011, "num_tokens": 60201119.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997863173484802, "sampling/importance_sampling_ratio/min": 0.2298680990934372, "sampling/sampling_logp_difference/max": 1.4702496528625488, "sampling/sampling_logp_difference/mean": 0.016630372032523155, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.16174277663230896, "epoch": 2.3370098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.8753449133823992, "kl": 0.07965998351573944, "learning_rate": 1.4331236692371384e-07, "loss": 0.0643, "num_tokens": 60226479.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.999763548374176, "sampling/importance_sampling_ratio/min": 0.5685030817985535, "sampling/sampling_logp_difference/max": 0.5647485256195068, "sampling/sampling_logp_difference/mean": 0.010479824617505074, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 176.234375, "completions/mean_terminated_length": 176.234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.26102542877197266, "epoch": 2.338235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.0885321248888284, "kl": 0.07639878243207932, "learning_rate": 1.428135063763985e-07, "loss": -0.0041, "num_tokens": 60259150.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6030136346817017, "sampling/importance_sampling_ratio/mean": 1.0001156330108643, "sampling/importance_sampling_ratio/min": 0.5586609840393066, "sampling/sampling_logp_difference/max": 0.5822124481201172, "sampling/sampling_logp_difference/mean": 0.016059918329119682, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 198.234375, "completions/mean_terminated_length": 198.234375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.23330505192279816, "epoch": 2.3394607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.8393192967961558, "kl": 0.07623052597045898, "learning_rate": 1.4231537089503675e-07, "loss": -0.0249, "num_tokens": 60290381.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8180720806121826, "sampling/importance_sampling_ratio/mean": 0.9994747638702393, "sampling/importance_sampling_ratio/min": 0.6151342391967773, "sampling/sampling_logp_difference/max": 0.5977766513824463, "sampling/sampling_logp_difference/mean": 0.014114055782556534, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 258.6875, "completions/mean_terminated_length": 258.6875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.27300840616226196, "epoch": 2.340686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.0987861380741548, "kl": 0.060729093849658966, "learning_rate": 1.4181796149081194e-07, "loss": -0.0326, "num_tokens": 60327785.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.644620418548584, "sampling/importance_sampling_ratio/mean": 1.0000439882278442, "sampling/importance_sampling_ratio/min": 0.46871840953826904, "sampling/sampling_logp_difference/max": 0.7577531337738037, "sampling/sampling_logp_difference/mean": 0.014157635159790516, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.21560019254684448, "epoch": 2.3419117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.417307635747955, "kl": 0.12530824542045593, "learning_rate": 1.4132127917343394e-07, "loss": 0.0846, "num_tokens": 60357441.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002789497375488, "sampling/importance_sampling_ratio/min": 0.49835866689682007, "sampling/sampling_logp_difference/max": 0.8950405120849609, "sampling/sampling_logp_difference/mean": 0.012443248182535172, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 190.53125, "completions/mean_terminated_length": 190.53125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28844764828681946, "epoch": 2.343137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.7136256914682377, "kl": 0.09305107593536377, "learning_rate": 1.4082532495113624e-07, "loss": 0.0809, "num_tokens": 60386579.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5612086057662964, "sampling/importance_sampling_ratio/mean": 0.9995530843734741, "sampling/importance_sampling_ratio/min": 0.4292699098587036, "sampling/sampling_logp_difference/max": 0.8456693887710571, "sampling/sampling_logp_difference/mean": 0.014590008184313774, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 223.234375, "completions/mean_terminated_length": 223.234375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.19692984223365784, "epoch": 2.344362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.04324036073446108, "kl": 0.05780654400587082, "learning_rate": 1.4033009983067452e-07, "loss": 0.0006, "num_tokens": 60420386.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7776691913604736, "sampling/importance_sampling_ratio/mean": 1.0006442070007324, "sampling/importance_sampling_ratio/min": 0.5536872148513794, "sampling/sampling_logp_difference/max": 0.5911552906036377, "sampling/sampling_logp_difference/mean": 0.010026085190474987, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 197.3125, "completions/mean_terminated_length": 197.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27616995573043823, "epoch": 2.3455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.419558332972906, "kl": 0.1510027050971985, "learning_rate": 1.398356048173242e-07, "loss": 0.0202, "num_tokens": 60453222.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8812905550003052, "sampling/importance_sampling_ratio/mean": 0.9990477561950684, "sampling/importance_sampling_ratio/min": 0.5322301387786865, "sampling/sampling_logp_difference/max": 0.6319580078125, "sampling/sampling_logp_difference/mean": 0.016108594834804535, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 221.609375, "completions/mean_terminated_length": 221.609375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2639700770378113, "epoch": 2.346813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.0590373478268813, "kl": 0.08162802457809448, "learning_rate": 1.3934184091487915e-07, "loss": -0.0061, "num_tokens": 60481821.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999839186668396, "sampling/importance_sampling_ratio/min": 0.5718647837638855, "sampling/sampling_logp_difference/max": 0.7172503471374512, "sampling/sampling_logp_difference/mean": 0.012737276032567024, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 164.171875, "completions/mean_terminated_length": 164.171875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22160595655441284, "epoch": 2.3480392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.04540446260471624, "kl": 0.06879030168056488, "learning_rate": 1.3884880912564873e-07, "loss": 0.0007, "num_tokens": 60508328.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9665645360946655, "sampling/importance_sampling_ratio/mean": 1.000072717666626, "sampling/importance_sampling_ratio/min": 0.5566588044166565, "sampling/sampling_logp_difference/max": 0.6762881278991699, "sampling/sampling_logp_difference/mean": 0.012744169682264328, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 180.5625, "completions/mean_terminated_length": 180.5625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22264420986175537, "epoch": 2.349264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.050170777608224214, "kl": 0.07316240668296814, "learning_rate": 1.3835651045045598e-07, "loss": 0.0007, "num_tokens": 60533996.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4916131496429443, "sampling/importance_sampling_ratio/mean": 1.0000035762786865, "sampling/importance_sampling_ratio/min": 0.5367288589477539, "sampling/sampling_logp_difference/max": 0.6222622394561768, "sampling/sampling_logp_difference/mean": 0.012823224067687988, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.22194664180278778, "epoch": 2.3504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.7970809259858578, "kl": 0.0707157701253891, "learning_rate": 1.3786494588863633e-07, "loss": 0.0615, "num_tokens": 60574804.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9481863975524902, "sampling/importance_sampling_ratio/mean": 0.999103307723999, "sampling/importance_sampling_ratio/min": 0.5347524881362915, "sampling/sampling_logp_difference/max": 0.6668989658355713, "sampling/sampling_logp_difference/mean": 0.012659601867198944, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 222.296875, "completions/mean_terminated_length": 222.296875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2153092473745346, "epoch": 2.3517156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.214353351093163, "kl": 0.06867530941963196, "learning_rate": 1.3737411643803448e-07, "loss": -0.0732, "num_tokens": 60607047.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8346688747406006, "sampling/importance_sampling_ratio/mean": 0.9999586343765259, "sampling/importance_sampling_ratio/min": 0.6097875833511353, "sampling/sampling_logp_difference/max": 0.6068639755249023, "sampling/sampling_logp_difference/mean": 0.013082252815365791, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 234.03125, "completions/mean_terminated_length": 234.03125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2702324390411377, "epoch": 2.3529411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.0443638998213056, "kl": 0.06359662860631943, "learning_rate": 1.368840230950035e-07, "loss": 0.0009, "num_tokens": 60641929.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5788229703903198, "sampling/importance_sampling_ratio/mean": 0.9997637867927551, "sampling/importance_sampling_ratio/min": 0.5266174674034119, "sampling/sampling_logp_difference/max": 0.6412808895111084, "sampling/sampling_logp_difference/mean": 0.015107502229511738, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 179.140625, "completions/mean_terminated_length": 179.140625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.23998671770095825, "epoch": 2.3541666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0589424191865848, "kl": 0.060683056712150574, "learning_rate": 1.3639466685440132e-07, "loss": 0.0006, "num_tokens": 60671042.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000229001045227, "sampling/importance_sampling_ratio/min": 0.5057139992713928, "sampling/sampling_logp_difference/max": 0.7327079772949219, "sampling/sampling_logp_difference/mean": 0.01452082209289074, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.22048285603523254, "epoch": 2.355392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.050767824219279896, "kl": 0.07472100853919983, "learning_rate": 1.3590604870959043e-07, "loss": 0.0008, "num_tokens": 60696994.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9099973440170288, "sampling/importance_sampling_ratio/mean": 0.9994252920150757, "sampling/importance_sampling_ratio/min": 0.29913097620010376, "sampling/sampling_logp_difference/max": 1.2068737745285034, "sampling/sampling_logp_difference/mean": 0.013492463156580925, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 217.3125, "completions/mean_terminated_length": 217.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23784899711608887, "epoch": 2.3566176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.6033230741854918, "kl": 0.07405094802379608, "learning_rate": 1.3541816965243462e-07, "loss": -0.0331, "num_tokens": 60732646.0, "reward": 0.125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996098279953003, "sampling/importance_sampling_ratio/min": 0.3956761956214905, "sampling/sampling_logp_difference/max": 0.9771990776062012, "sampling/sampling_logp_difference/mean": 0.013517213985323906, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 145.296875, "completions/mean_terminated_length": 145.296875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.21970178186893463, "epoch": 2.357843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0700749236722222, "kl": 0.082846499979496, "learning_rate": 1.3493103067329737e-07, "loss": 0.0008, "num_tokens": 60760249.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7992300987243652, "sampling/importance_sampling_ratio/mean": 1.000280499458313, "sampling/importance_sampling_ratio/min": 0.5525192618370056, "sampling/sampling_logp_difference/max": 0.5932669639587402, "sampling/sampling_logp_difference/mean": 0.01530250534415245, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 271.921875, "completions/mean_terminated_length": 271.921875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.21385936439037323, "epoch": 2.3590686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 1.5391506686481062, "kl": 0.06280796229839325, "learning_rate": 1.3444463276104012e-07, "loss": 0.0592, "num_tokens": 60796628.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8644733428955078, "sampling/importance_sampling_ratio/mean": 1.0002459287643433, "sampling/importance_sampling_ratio/min": 0.537388801574707, "sampling/sampling_logp_difference/max": 0.622978687286377, "sampling/sampling_logp_difference/mean": 0.014365228824317455, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 200.84375, "completions/mean_terminated_length": 200.84375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2779545783996582, "epoch": 2.360294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.80709516557649, "kl": 0.09816673398017883, "learning_rate": 1.3395897690301966e-07, "loss": 0.171, "num_tokens": 60829978.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7608885765075684, "sampling/importance_sampling_ratio/mean": 1.000191330909729, "sampling/importance_sampling_ratio/min": 0.5771064162254333, "sampling/sampling_logp_difference/max": 0.5658185482025146, "sampling/sampling_logp_difference/mean": 0.014423665590584278, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 232.703125, "completions/mean_terminated_length": 232.703125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.21351173520088196, "epoch": 2.361519607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.970063551926708, "kl": 0.06985197961330414, "learning_rate": 1.3347406408508694e-07, "loss": 0.0851, "num_tokens": 60860887.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6141871213912964, "sampling/importance_sampling_ratio/mean": 1.0003697872161865, "sampling/importance_sampling_ratio/min": 0.4937157928943634, "sampling/sampling_logp_difference/max": 0.7057952880859375, "sampling/sampling_logp_difference/mean": 0.011303408071398735, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 238.671875, "completions/mean_terminated_length": 238.671875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.23693378269672394, "epoch": 2.3627450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.4764931315312262, "kl": 0.062237031757831573, "learning_rate": 1.3298989529158378e-07, "loss": 0.0926, "num_tokens": 60896994.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5772426128387451, "sampling/importance_sampling_ratio/mean": 0.9998999834060669, "sampling/importance_sampling_ratio/min": 0.5110949873924255, "sampling/sampling_logp_difference/max": 0.6711997985839844, "sampling/sampling_logp_difference/mean": 0.012698430567979813, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 148.5625, "completions/mean_terminated_length": 148.5625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1978602111339569, "epoch": 2.363970588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.4896909152778242, "kl": 0.09970251470804214, "learning_rate": 1.325064715053425e-07, "loss": 0.0155, "num_tokens": 60920598.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.581521987915039, "sampling/importance_sampling_ratio/mean": 1.0005525350570679, "sampling/importance_sampling_ratio/min": 0.5561189651489258, "sampling/sampling_logp_difference/max": 0.5867730379104614, "sampling/sampling_logp_difference/mean": 0.012599577195942402, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 222.046875, "completions/mean_terminated_length": 222.046875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.29237282276153564, "epoch": 2.3651960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.298288706391106, "kl": 0.08031923323869705, "learning_rate": 1.320237937076825e-07, "loss": 0.0022, "num_tokens": 60953257.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.65193510055542, "sampling/importance_sampling_ratio/mean": 0.999497652053833, "sampling/importance_sampling_ratio/min": 0.4843113422393799, "sampling/sampling_logp_difference/max": 0.725027322769165, "sampling/sampling_logp_difference/mean": 0.015679839998483658, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 185.296875, "completions/mean_terminated_length": 185.296875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2450130134820938, "epoch": 2.366421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.07191566437545724, "kl": 0.08761993050575256, "learning_rate": 1.3154186287840946e-07, "loss": 0.0009, "num_tokens": 60984252.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8617053031921387, "sampling/importance_sampling_ratio/mean": 1.0000516176223755, "sampling/importance_sampling_ratio/min": 0.4116455316543579, "sampling/sampling_logp_difference/max": 0.8875926733016968, "sampling/sampling_logp_difference/mean": 0.015412520617246628, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2743494510650635, "epoch": 2.3676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.5918629660933312, "kl": 0.112302765250206, "learning_rate": 1.310606799958122e-07, "loss": 0.0379, "num_tokens": 61018288.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6630561351776123, "sampling/importance_sampling_ratio/mean": 1.0001001358032227, "sampling/importance_sampling_ratio/min": 0.6094446778297424, "sampling/sampling_logp_difference/max": 0.5086569786071777, "sampling/sampling_logp_difference/mean": 0.015683766454458237, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 227.078125, "completions/mean_terminated_length": 227.078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.27433329820632935, "epoch": 2.368872549019608, "frac_reward_zero_std": 0.5, "grad_norm": 1.584208504819516, "kl": 0.06815879046916962, "learning_rate": 1.305802460366615e-07, "loss": 0.0219, "num_tokens": 61057557.0, "reward": 0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.897632122039795, "sampling/importance_sampling_ratio/mean": 0.9998785257339478, "sampling/importance_sampling_ratio/min": 0.5910902619361877, "sampling/sampling_logp_difference/max": 0.6406068801879883, "sampling/sampling_logp_difference/mean": 0.01572701334953308, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 156.140625, "completions/mean_terminated_length": 156.140625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.23684397339820862, "epoch": 2.3700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.2777498029803074, "kl": 0.08773821592330933, "learning_rate": 1.3010056197620812e-07, "loss": -0.0014, "num_tokens": 61087326.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8796398639678955, "sampling/importance_sampling_ratio/mean": 1.0005260705947876, "sampling/importance_sampling_ratio/min": 0.3654533624649048, "sampling/sampling_logp_difference/max": 1.0066165924072266, "sampling/sampling_logp_difference/mean": 0.01609150506556034, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 251.4375, "completions/mean_terminated_length": 251.4375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.33900707960128784, "epoch": 2.3713235294117645, "frac_reward_zero_std": 0.25, "grad_norm": 2.074405565419832, "kl": 0.1103014275431633, "learning_rate": 1.2962162878817985e-07, "loss": 0.0191, "num_tokens": 61125354.0, "reward": 0.0625, "reward_std": 0.5501632690429688, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5916136503219604, "sampling/importance_sampling_ratio/mean": 1.0007879734039307, "sampling/importance_sampling_ratio/min": 0.60539710521698, "sampling/sampling_logp_difference/max": 0.5018706321716309, "sampling/sampling_logp_difference/mean": 0.017022768035531044, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 242.453125, "completions/mean_terminated_length": 242.453125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23958539962768555, "epoch": 2.372549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 2.5269838185566615, "kl": 0.08082319051027298, "learning_rate": 1.2914344744478112e-07, "loss": 0.0932, "num_tokens": 61159911.0, "reward": 0.90625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.7728302478790283, "sampling/importance_sampling_ratio/mean": 1.0002799034118652, "sampling/importance_sampling_ratio/min": 0.6110666990280151, "sampling/sampling_logp_difference/max": 0.5725772380828857, "sampling/sampling_logp_difference/mean": 0.013866308145225048, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 203.890625, "completions/mean_terminated_length": 203.890625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.22986426949501038, "epoch": 2.373774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 1.042505482485899, "kl": 0.08157625794410706, "learning_rate": 1.2866601891668942e-07, "loss": 0.0314, "num_tokens": 61189232.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5656235218048096, "sampling/importance_sampling_ratio/mean": 1.000196099281311, "sampling/importance_sampling_ratio/min": 0.3692629337310791, "sampling/sampling_logp_difference/max": 0.996246337890625, "sampling/sampling_logp_difference/mean": 0.013883614912629128, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 176.28125, "completions/mean_terminated_length": 176.28125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.1920793056488037, "epoch": 2.375, "frac_reward_zero_std": 0.75, "grad_norm": 1.3512202642752835, "kl": 0.09101974219083786, "learning_rate": 1.2818934417305477e-07, "loss": 0.0611, "num_tokens": 61217746.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6356041431427002, "sampling/importance_sampling_ratio/mean": 1.0001342296600342, "sampling/importance_sampling_ratio/min": 0.18803855776786804, "sampling/sampling_logp_difference/max": 1.6711082458496094, "sampling/sampling_logp_difference/mean": 0.012220809236168861, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 189.171875, "completions/mean_terminated_length": 189.171875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.22339913249015808, "epoch": 2.376225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.3587392805565413, "kl": 0.07989152520895004, "learning_rate": 1.2771342418149656e-07, "loss": -0.0014, "num_tokens": 61250909.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.753654956817627, "sampling/importance_sampling_ratio/mean": 1.00027334690094, "sampling/importance_sampling_ratio/min": 0.2822793126106262, "sampling/sampling_logp_difference/max": 1.2648582458496094, "sampling/sampling_logp_difference/mean": 0.015397697687149048, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 212.09375, "completions/mean_terminated_length": 212.09375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.333929181098938, "epoch": 2.377450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.6350576499593368, "kl": 0.15975189208984375, "learning_rate": 1.2723825990810204e-07, "loss": 0.0341, "num_tokens": 61283187.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002503395080566, "sampling/importance_sampling_ratio/min": 0.6097145676612854, "sampling/sampling_logp_difference/max": 0.7700153589248657, "sampling/sampling_logp_difference/mean": 0.01582321710884571, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 200.515625, "completions/mean_terminated_length": 200.515625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.1840367615222931, "epoch": 2.3786764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.836295613462871, "kl": 0.06945012509822845, "learning_rate": 1.2676385231742494e-07, "loss": 0.0797, "num_tokens": 61312564.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.8142162561416626, "sampling/importance_sampling_ratio/mean": 1.0005759000778198, "sampling/importance_sampling_ratio/min": 0.4323919415473938, "sampling/sampling_logp_difference/max": 0.8384228944778442, "sampling/sampling_logp_difference/mean": 0.011091177351772785, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.19584491848945618, "epoch": 2.3799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.3394160207249557, "kl": 0.05044228583574295, "learning_rate": 1.262902023724824e-07, "loss": -0.0549, "num_tokens": 61342788.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5584990978240967, "sampling/importance_sampling_ratio/mean": 0.9999395608901978, "sampling/importance_sampling_ratio/min": 0.5607408285140991, "sampling/sampling_logp_difference/max": 0.5784964561462402, "sampling/sampling_logp_difference/mean": 0.01116795465350151, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 226.15625, "completions/mean_terminated_length": 226.15625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3171435296535492, "epoch": 2.381127450980392, "frac_reward_zero_std": 0.25, "grad_norm": 1.7356636484164167, "kl": 0.12276682257652283, "learning_rate": 1.258173110347538e-07, "loss": -0.0132, "num_tokens": 61386430.0, "reward": -0.21875, "reward_std": 0.7297805547714233, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.7008626461029053, "sampling/importance_sampling_ratio/mean": 1.0000245571136475, "sampling/importance_sampling_ratio/min": 0.48895463347435, "sampling/sampling_logp_difference/max": 0.7154855728149414, "sampling/sampling_logp_difference/mean": 0.01608789712190628, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 205.171875, "completions/mean_terminated_length": 205.171875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2562256455421448, "epoch": 2.3823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.7410033887855307, "kl": 0.11961342394351959, "learning_rate": 1.253451792641785e-07, "loss": 0.0057, "num_tokens": 61418857.0, "reward": 0.0, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.702419638633728, "sampling/importance_sampling_ratio/mean": 0.9998294711112976, "sampling/importance_sampling_ratio/min": 0.5145463347434998, "sampling/sampling_logp_difference/max": 0.6644695997238159, "sampling/sampling_logp_difference/mean": 0.015101276338100433, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3039652705192566, "epoch": 2.383578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.344275615023439, "kl": 0.10801234841346741, "learning_rate": 1.248738080191543e-07, "loss": 0.0113, "num_tokens": 61446689.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.911575436592102, "sampling/importance_sampling_ratio/mean": 1.0003383159637451, "sampling/importance_sampling_ratio/min": 0.5024508833885193, "sampling/sampling_logp_difference/max": 0.6882574558258057, "sampling/sampling_logp_difference/mean": 0.016542499884963036, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 173.09375, "completions/mean_terminated_length": 173.09375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.24207086861133575, "epoch": 2.3848039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.06780617246457922, "kl": 0.10478676110506058, "learning_rate": 1.244031982565349e-07, "loss": 0.0012, "num_tokens": 61472359.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4732166528701782, "sampling/importance_sampling_ratio/mean": 0.9993559122085571, "sampling/importance_sampling_ratio/min": 0.5910095572471619, "sampling/sampling_logp_difference/max": 0.5259230136871338, "sampling/sampling_logp_difference/mean": 0.013632368296384811, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 189.171875, "completions/mean_terminated_length": 189.171875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.31813693046569824, "epoch": 2.386029411764706, "frac_reward_zero_std": 0.25, "grad_norm": 2.458377801973784, "kl": 0.11870501935482025, "learning_rate": 1.239333509316281e-07, "loss": -0.0297, "num_tokens": 61505122.0, "reward": 0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9482307434082031, "sampling/importance_sampling_ratio/mean": 0.9993807077407837, "sampling/importance_sampling_ratio/min": 0.5460110306739807, "sampling/sampling_logp_difference/max": 0.6669216156005859, "sampling/sampling_logp_difference/mean": 0.016893664374947548, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 200.96875, "completions/mean_terminated_length": 200.96875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.21835170686244965, "epoch": 2.3872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.3072731399420443, "kl": 0.07475827634334564, "learning_rate": 1.2346426699819456e-07, "loss": 0.0075, "num_tokens": 61536496.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6572539806365967, "sampling/importance_sampling_ratio/mean": 1.000198483467102, "sampling/importance_sampling_ratio/min": 0.5988007187843323, "sampling/sampling_logp_difference/max": 0.5128264427185059, "sampling/sampling_logp_difference/mean": 0.0124274967238307, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 192.59375, "completions/mean_terminated_length": 192.59375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.1802973747253418, "epoch": 2.388480392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.04346566312788138, "kl": 0.062366824597120285, "learning_rate": 1.2299594740844476e-07, "loss": 0.0006, "num_tokens": 61565702.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8180396556854248, "sampling/importance_sampling_ratio/mean": 1.0000131130218506, "sampling/importance_sampling_ratio/min": 0.6013502478599548, "sampling/sampling_logp_difference/max": 0.5977587699890137, "sampling/sampling_logp_difference/mean": 0.011643131263554096, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 160.71875, "completions/mean_terminated_length": 160.71875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.21568229794502258, "epoch": 2.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.07281075137744002, "kl": 0.10016850382089615, "learning_rate": 1.225283931130378e-07, "loss": 0.001, "num_tokens": 61590868.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4943758249282837, "sampling/importance_sampling_ratio/mean": 0.9999530911445618, "sampling/importance_sampling_ratio/min": 0.48211976885795593, "sampling/sampling_logp_difference/max": 0.7295627593994141, "sampling/sampling_logp_difference/mean": 0.013741142116487026, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.23186248540878296, "epoch": 2.3909313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.042213109083424606, "kl": 0.06597447395324707, "learning_rate": 1.220616050610791e-07, "loss": 0.0006, "num_tokens": 61618916.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997356534004211, "sampling/importance_sampling_ratio/min": 0.5676605105400085, "sampling/sampling_logp_difference/max": 0.8005461692810059, "sampling/sampling_logp_difference/mean": 0.014332741498947144, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 190.359375, "completions/mean_terminated_length": 190.359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24763555824756622, "epoch": 2.392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.3624870551913222, "kl": 0.07363063097000122, "learning_rate": 1.2159558420011905e-07, "loss": 0.0208, "num_tokens": 61649259.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6884815692901611, "sampling/importance_sampling_ratio/mean": 1.000169277191162, "sampling/importance_sampling_ratio/min": 0.3574312925338745, "sampling/sampling_logp_difference/max": 1.0288121700286865, "sampling/sampling_logp_difference/mean": 0.014056330546736717, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 240.234375, "completions/mean_terminated_length": 240.234375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.20466625690460205, "epoch": 2.3933823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 1.442820055251828, "kl": 0.09523437917232513, "learning_rate": 1.2113033147615071e-07, "loss": 0.0382, "num_tokens": 61679434.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002615451812744, "sampling/importance_sampling_ratio/min": 0.38819387555122375, "sampling/sampling_logp_difference/max": 1.252941608428955, "sampling/sampling_logp_difference/mean": 0.01280001550912857, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 187.640625, "completions/mean_terminated_length": 187.640625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2240035980939865, "epoch": 2.394607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.2719615435687175, "kl": 0.05534994229674339, "learning_rate": 1.206658478336071e-07, "loss": 0.0253, "num_tokens": 61708291.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.601393699645996, "sampling/importance_sampling_ratio/mean": 1.0005192756652832, "sampling/importance_sampling_ratio/min": 0.6298378109931946, "sampling/sampling_logp_difference/max": 0.4708743095397949, "sampling/sampling_logp_difference/mean": 0.012002687901258469, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 189.609375, "completions/mean_terminated_length": 189.609375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2199474275112152, "epoch": 2.3958333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.043589706437910984, "kl": 0.07419893145561218, "learning_rate": 1.2020213421536103e-07, "loss": 0.0007, "num_tokens": 61737386.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999531090259552, "sampling/importance_sampling_ratio/min": 0.5294254422187805, "sampling/sampling_logp_difference/max": 0.7695995569229126, "sampling/sampling_logp_difference/mean": 0.013445645570755005, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 184.671875, "completions/mean_terminated_length": 184.671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.309781551361084, "epoch": 2.3970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.05492677060050852, "kl": 0.08925427496433258, "learning_rate": 1.1973919156272138e-07, "loss": 0.0009, "num_tokens": 61772485.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7128942012786865, "sampling/importance_sampling_ratio/mean": 1.0004339218139648, "sampling/importance_sampling_ratio/min": 0.48819631338119507, "sampling/sampling_logp_difference/max": 0.7170376777648926, "sampling/sampling_logp_difference/mean": 0.017541082575917244, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 178.140625, "completions/mean_terminated_length": 178.140625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.27200642228126526, "epoch": 2.3982843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.6495497483034853, "kl": 0.09907713532447815, "learning_rate": 1.1927702081543278e-07, "loss": 0.0149, "num_tokens": 61802238.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6173079013824463, "sampling/importance_sampling_ratio/mean": 1.0000338554382324, "sampling/importance_sampling_ratio/min": 0.4597283601760864, "sampling/sampling_logp_difference/max": 0.777119517326355, "sampling/sampling_logp_difference/mean": 0.01572125218808651, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 202.1875, "completions/mean_terminated_length": 202.1875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2908550202846527, "epoch": 2.3995098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.9769531666950955, "kl": 0.08010758459568024, "learning_rate": 1.188156229116724e-07, "loss": 0.0103, "num_tokens": 61843258.0, "reward": -0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6162408590316772, "sampling/importance_sampling_ratio/mean": 1.000512719154358, "sampling/importance_sampling_ratio/min": 0.5101743936538696, "sampling/sampling_logp_difference/max": 0.6730027198791504, "sampling/sampling_logp_difference/mean": 0.01668713241815567, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 180.828125, "completions/mean_terminated_length": 180.828125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.21159318089485168, "epoch": 2.400735294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.9940042567980671, "kl": 0.06886664032936096, "learning_rate": 1.1835499878804861e-07, "loss": 0.0111, "num_tokens": 61873087.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8365287780761719, "sampling/importance_sampling_ratio/mean": 0.999881386756897, "sampling/importance_sampling_ratio/min": 0.4955132007598877, "sampling/sampling_logp_difference/max": 0.7021613121032715, "sampling/sampling_logp_difference/mean": 0.012495124712586403, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 139.953125, "completions/mean_terminated_length": 139.953125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2683044672012329, "epoch": 2.4019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.09810569386867438, "kl": 0.09765708446502686, "learning_rate": 1.1789514937959965e-07, "loss": 0.001, "num_tokens": 61897340.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6773133277893066, "sampling/importance_sampling_ratio/mean": 0.9999996423721313, "sampling/importance_sampling_ratio/min": 0.5363419651985168, "sampling/sampling_logp_difference/max": 0.6229833364486694, "sampling/sampling_logp_difference/mean": 0.016239367425441742, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 196.03125, "completions/mean_terminated_length": 196.03125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.34146174788475037, "epoch": 2.403186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.1141920873923354, "kl": 0.14285296201705933, "learning_rate": 1.1743607561979013e-07, "loss": 0.0053, "num_tokens": 61929310.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4234881401062012, "sampling/importance_sampling_ratio/mean": 0.9995550513267517, "sampling/importance_sampling_ratio/min": 0.5124226212501526, "sampling/sampling_logp_difference/max": 0.6686055660247803, "sampling/sampling_logp_difference/mean": 0.017412405461072922, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 148.96875, "completions/mean_terminated_length": 148.96875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21372148394584656, "epoch": 2.4044117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 2.0202041764371432, "kl": 0.10127365589141846, "learning_rate": 1.1697777844051104e-07, "loss": 0.0277, "num_tokens": 61955708.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5071592330932617, "sampling/importance_sampling_ratio/mean": 1.000234842300415, "sampling/importance_sampling_ratio/min": 0.47133302688598633, "sampling/sampling_logp_difference/max": 0.752190351486206, "sampling/sampling_logp_difference/mean": 0.01135617308318615, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 176.421875, "completions/mean_terminated_length": 176.421875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.1962416172027588, "epoch": 2.405637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.03121125047462608, "kl": 0.04818423464894295, "learning_rate": 1.1652025877207644e-07, "loss": 0.0005, "num_tokens": 61983719.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6769741773605347, "sampling/importance_sampling_ratio/mean": 1.00025475025177, "sampling/importance_sampling_ratio/min": 0.5177282094955444, "sampling/sampling_logp_difference/max": 0.6583049297332764, "sampling/sampling_logp_difference/mean": 0.012465772219002247, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 153.203125, "completions/mean_terminated_length": 153.203125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.23137956857681274, "epoch": 2.406862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0740601484478167, "kl": 0.08045706152915955, "learning_rate": 1.1606351754322247e-07, "loss": 0.0009, "num_tokens": 62008612.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8397560119628906, "sampling/importance_sampling_ratio/mean": 1.000663161277771, "sampling/importance_sampling_ratio/min": 0.5463835597038269, "sampling/sampling_logp_difference/max": 0.6096329689025879, "sampling/sampling_logp_difference/mean": 0.014718655496835709, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 171.984375, "completions/mean_terminated_length": 171.984375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3546157479286194, "epoch": 2.4080882352941178, "frac_reward_zero_std": 0.25, "grad_norm": 2.1880953904965823, "kl": 0.11963890492916107, "learning_rate": 1.156075556811048e-07, "loss": -0.031, "num_tokens": 62039075.0, "reward": -0.125, "reward_std": 0.5738953948020935, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4352010488510132, "sampling/importance_sampling_ratio/mean": 0.999981164932251, "sampling/importance_sampling_ratio/min": 0.5505171418190002, "sampling/sampling_logp_difference/max": 0.5968972444534302, "sampling/sampling_logp_difference/mean": 0.017265722155570984, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 151.484375, "completions/mean_terminated_length": 151.484375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.22168831527233124, "epoch": 2.409313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.6869881388829138, "kl": 0.10283079743385315, "learning_rate": 1.1515237411129697e-07, "loss": 0.0652, "num_tokens": 62069026.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999632716178894, "sampling/importance_sampling_ratio/min": 0.5867922902107239, "sampling/sampling_logp_difference/max": 0.7418670654296875, "sampling/sampling_logp_difference/mean": 0.012990560382604599, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.21114423871040344, "epoch": 2.4105392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.65379610952179, "kl": 0.12189424782991409, "learning_rate": 1.1469797375778901e-07, "loss": 0.0623, "num_tokens": 62092302.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6554591655731201, "sampling/importance_sampling_ratio/mean": 1.0000386238098145, "sampling/importance_sampling_ratio/min": 0.6058178544044495, "sampling/sampling_logp_difference/max": 0.5040783882141113, "sampling/sampling_logp_difference/mean": 0.012203224934637547, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 208.828125, "completions/mean_terminated_length": 208.828125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.2707853615283966, "epoch": 2.411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.4753170204784647, "kl": 0.07635916769504547, "learning_rate": 1.1424435554298473e-07, "loss": -0.0148, "num_tokens": 62127123.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7422895431518555, "sampling/importance_sampling_ratio/mean": 1.0000462532043457, "sampling/importance_sampling_ratio/min": 0.5663594603538513, "sampling/sampling_logp_difference/max": 0.5685262680053711, "sampling/sampling_logp_difference/mean": 0.014813482761383057, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 206.140625, "completions/mean_terminated_length": 206.140625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.30891549587249756, "epoch": 2.4129901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 2.1459926530914273, "kl": 0.0987582579255104, "learning_rate": 1.1379152038770029e-07, "loss": 0.0225, "num_tokens": 62161564.0, "reward": 0.6875, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4569240808486938, "sampling/importance_sampling_ratio/mean": 0.9999404549598694, "sampling/importance_sampling_ratio/min": 0.49131664633750916, "sampling/sampling_logp_difference/max": 0.7106664180755615, "sampling/sampling_logp_difference/mean": 0.016402438282966614, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 197.53125, "completions/mean_terminated_length": 197.53125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2572060227394104, "epoch": 2.4142156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.257642491145384, "kl": 0.1277352273464203, "learning_rate": 1.1333946921116234e-07, "loss": -0.019, "num_tokens": 62190302.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5978838205337524, "sampling/importance_sampling_ratio/mean": 1.0002050399780273, "sampling/importance_sampling_ratio/min": 0.6207980513572693, "sampling/sampling_logp_difference/max": 0.4767494201660156, "sampling/sampling_logp_difference/mean": 0.0142198596149683, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 200.046875, "completions/mean_terminated_length": 200.046875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.30708110332489014, "epoch": 2.4154411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.6849725653324623, "kl": 0.07924254238605499, "learning_rate": 1.1288820293100637e-07, "loss": -0.021, "num_tokens": 62222513.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991940259933472, "sampling/importance_sampling_ratio/min": 0.38363999128341675, "sampling/sampling_logp_difference/max": 1.9116621017456055, "sampling/sampling_logp_difference/mean": 0.016897523775696754, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 169.421875, "completions/mean_terminated_length": 169.421875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2777869701385498, "epoch": 2.4166666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 2.049756210521659, "kl": 0.09561365842819214, "learning_rate": 1.1243772246327415e-07, "loss": -0.0874, "num_tokens": 62252716.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003769397735596, "sampling/importance_sampling_ratio/min": 0.5252593159675598, "sampling/sampling_logp_difference/max": 0.7423210144042969, "sampling/sampling_logp_difference/mean": 0.01625244691967964, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 255.96875, "completions/mean_terminated_length": 255.96875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.32085007429122925, "epoch": 2.417892156862745, "frac_reward_zero_std": 0.25, "grad_norm": 1.8697659940414395, "kl": 0.09609529376029968, "learning_rate": 1.1198802872241242e-07, "loss": -0.0031, "num_tokens": 62288458.0, "reward": 0.5, "reward_std": 0.6663130521774292, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5612561702728271, "sampling/importance_sampling_ratio/mean": 0.9996238946914673, "sampling/importance_sampling_ratio/min": 0.3742508292198181, "sampling/sampling_logp_difference/max": 0.9828290939331055, "sampling/sampling_logp_difference/mean": 0.01732058823108673, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 145.78125, "completions/mean_terminated_length": 145.78125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.19306042790412903, "epoch": 2.4191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.07594578510975517, "kl": 0.09978260099887848, "learning_rate": 1.1153912262127119e-07, "loss": 0.0011, "num_tokens": 62317868.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.910028338432312, "sampling/importance_sampling_ratio/mean": 1.0001709461212158, "sampling/importance_sampling_ratio/min": 0.5480507016181946, "sampling/sampling_logp_difference/max": 0.647118091583252, "sampling/sampling_logp_difference/mean": 0.014451880939304829, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 198.6875, "completions/mean_terminated_length": 198.6875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.20174993574619293, "epoch": 2.420343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.08789469737367238, "kl": 0.09293520450592041, "learning_rate": 1.1109100507110131e-07, "loss": 0.0008, "num_tokens": 62344376.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071581602096558, "sampling/importance_sampling_ratio/mean": 1.0001535415649414, "sampling/importance_sampling_ratio/min": 0.47808966040611267, "sampling/sampling_logp_difference/max": 0.7379570007324219, "sampling/sampling_logp_difference/mean": 0.011941138654947281, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 169.828125, "completions/mean_terminated_length": 169.828125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3261312246322632, "epoch": 2.4215686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.3812259383358159, "kl": 0.08990463614463806, "learning_rate": 1.1064367698155303e-07, "loss": 0.0094, "num_tokens": 62377165.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6962614059448242, "sampling/importance_sampling_ratio/mean": 0.9996572136878967, "sampling/importance_sampling_ratio/min": 0.6097140312194824, "sampling/sampling_logp_difference/max": 0.5284266471862793, "sampling/sampling_logp_difference/mean": 0.016119036823511124, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 165.265625, "completions/mean_terminated_length": 165.265625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.22535020112991333, "epoch": 2.422794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 2.017966091119861, "kl": 0.09670273959636688, "learning_rate": 1.1019713926067392e-07, "loss": 0.0178, "num_tokens": 62405614.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000312328338623, "sampling/importance_sampling_ratio/min": 0.44563791155815125, "sampling/sampling_logp_difference/max": 1.1860003471374512, "sampling/sampling_logp_difference/mean": 0.014435198158025742, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 277.328125, "completions/mean_terminated_length": 277.328125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24867111444473267, "epoch": 2.424019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.2647538538960692, "kl": 0.10145062953233719, "learning_rate": 1.0975139281490747e-07, "loss": -0.0364, "num_tokens": 62442195.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7393453121185303, "sampling/importance_sampling_ratio/mean": 1.0000133514404297, "sampling/importance_sampling_ratio/min": 0.5520194172859192, "sampling/sampling_logp_difference/max": 0.5941720008850098, "sampling/sampling_logp_difference/mean": 0.013782719150185585, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 172.8125, "completions/mean_terminated_length": 172.8125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.24885821342468262, "epoch": 2.4252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.07125148858301587, "kl": 0.1279226541519165, "learning_rate": 1.093064385490906e-07, "loss": 0.0011, "num_tokens": 62467607.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5403485298156738, "sampling/importance_sampling_ratio/mean": 1.0000057220458984, "sampling/importance_sampling_ratio/min": 0.6010531783103943, "sampling/sampling_logp_difference/max": 0.5090718269348145, "sampling/sampling_logp_difference/mean": 0.01431182585656643, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 255.421875, "completions/mean_terminated_length": 255.421875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.22771960496902466, "epoch": 2.426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.7144092251646503, "kl": 0.08566172420978546, "learning_rate": 1.0886227736645215e-07, "loss": -0.0003, "num_tokens": 62506674.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999967098236084, "sampling/importance_sampling_ratio/min": 0.2704428732395172, "sampling/sampling_logp_difference/max": 1.307694435119629, "sampling/sampling_logp_difference/mean": 0.01397204864770174, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3569865822792053, "epoch": 2.4276960784313726, "frac_reward_zero_std": 0.0, "grad_norm": 2.6258296681879294, "kl": 0.1403682827949524, "learning_rate": 1.0841891016861155e-07, "loss": 0.0063, "num_tokens": 62540242.0, "reward": 0.3125, "reward_std": 0.8389039635658264, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6555317640304565, "sampling/importance_sampling_ratio/mean": 0.9997969269752502, "sampling/importance_sampling_ratio/min": 0.3920708894729614, "sampling/sampling_logp_difference/max": 0.9363126754760742, "sampling/sampling_logp_difference/mean": 0.01937026157975197, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 178.09375, "completions/mean_terminated_length": 178.09375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.22271037101745605, "epoch": 2.428921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.973184989448742, "kl": 0.10955817997455597, "learning_rate": 1.0797633785557581e-07, "loss": 0.0177, "num_tokens": 62573192.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6316500902175903, "sampling/importance_sampling_ratio/mean": 1.0002367496490479, "sampling/importance_sampling_ratio/min": 0.6065034866333008, "sampling/sampling_logp_difference/max": 0.5000448226928711, "sampling/sampling_logp_difference/mean": 0.013782722875475883, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 171.28125, "completions/mean_terminated_length": 171.28125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.23561523854732513, "epoch": 2.4301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.5343826417136237, "kl": 0.07251683622598648, "learning_rate": 1.0753456132573885e-07, "loss": 0.0383, "num_tokens": 62604570.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5926032066345215, "sampling/importance_sampling_ratio/mean": 1.0000483989715576, "sampling/importance_sampling_ratio/min": 0.47895604372024536, "sampling/sampling_logp_difference/max": 0.7361464500427246, "sampling/sampling_logp_difference/mean": 0.013745970092713833, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 195.296875, "completions/mean_terminated_length": 195.296875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2558283507823944, "epoch": 2.431372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.17368507304050582, "kl": 0.07469744980335236, "learning_rate": 1.0709358147587883e-07, "loss": 0.0008, "num_tokens": 62636733.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6177139282226562, "sampling/importance_sampling_ratio/mean": 1.0005903244018555, "sampling/importance_sampling_ratio/min": 0.4985237419605255, "sampling/sampling_logp_difference/max": 0.6961040496826172, "sampling/sampling_logp_difference/mean": 0.015179083682596684, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 231.3125, "completions/mean_terminated_length": 231.3125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.29034364223480225, "epoch": 2.4325980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.0525714732231246, "kl": 0.09454121440649033, "learning_rate": 1.0665339920115718e-07, "loss": 0.0064, "num_tokens": 62668737.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6174228191375732, "sampling/importance_sampling_ratio/mean": 0.9998698830604553, "sampling/importance_sampling_ratio/min": 0.46674081683158875, "sampling/sampling_logp_difference/max": 0.7619811296463013, "sampling/sampling_logp_difference/mean": 0.015458998270332813, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 176.734375, "completions/mean_terminated_length": 176.734375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2578873038291931, "epoch": 2.4338235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.18534368544284197, "kl": 0.10618950426578522, "learning_rate": 1.0621401539511587e-07, "loss": 0.001, "num_tokens": 62702896.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.597326636314392, "sampling/importance_sampling_ratio/mean": 0.9995043277740479, "sampling/importance_sampling_ratio/min": 0.6241003274917603, "sampling/sampling_logp_difference/max": 0.47144412994384766, "sampling/sampling_logp_difference/mean": 0.015324244275689125, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 150.84375, "completions/mean_terminated_length": 150.84375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.21928420662879944, "epoch": 2.435049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.059385206113499005, "kl": 0.07461683452129364, "learning_rate": 1.0577543094967611e-07, "loss": 0.0007, "num_tokens": 62730070.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6468877792358398, "sampling/importance_sampling_ratio/mean": 0.9989715814590454, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.498887300491333, "sampling/sampling_logp_difference/mean": 0.014201447367668152, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 219.4375, "completions/mean_terminated_length": 219.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2593943476676941, "epoch": 2.436274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.04040715339197583, "kl": 0.059937912970781326, "learning_rate": 1.053376467551368e-07, "loss": 0.0006, "num_tokens": 62761218.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5471819639205933, "sampling/importance_sampling_ratio/mean": 1.000226616859436, "sampling/importance_sampling_ratio/min": 0.627091646194458, "sampling/sampling_logp_difference/max": 0.4666626453399658, "sampling/sampling_logp_difference/mean": 0.014485637657344341, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 178.234375, "completions/mean_terminated_length": 178.234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.20651875436306, "epoch": 2.4375, "frac_reward_zero_std": 0.75, "grad_norm": 1.1828434759628255, "kl": 0.08197721838951111, "learning_rate": 1.0490066370017181e-07, "loss": 0.019, "num_tokens": 62789505.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.450416088104248, "sampling/importance_sampling_ratio/mean": 1.0003294944763184, "sampling/importance_sampling_ratio/min": 0.49541762471199036, "sampling/sampling_logp_difference/max": 0.7023541927337646, "sampling/sampling_logp_difference/mean": 0.012643104419112206, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 218.671875, "completions/mean_terminated_length": 218.671875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.17756140232086182, "epoch": 2.438725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.051933208386524814, "kl": 0.06588190793991089, "learning_rate": 1.044644826718295e-07, "loss": 0.0006, "num_tokens": 62826188.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997557997703552, "sampling/importance_sampling_ratio/min": 0.4160601496696472, "sampling/sampling_logp_difference/max": 0.8769254684448242, "sampling/sampling_logp_difference/mean": 0.011307923123240471, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 190.59375, "completions/mean_terminated_length": 190.59375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24801194667816162, "epoch": 2.439950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.2933570005385122, "kl": 0.08753550052642822, "learning_rate": 1.0402910455552916e-07, "loss": -0.0104, "num_tokens": 62856754.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006647109985352, "sampling/importance_sampling_ratio/min": 0.5039380192756653, "sampling/sampling_logp_difference/max": 0.7099902629852295, "sampling/sampling_logp_difference/mean": 0.01531485840678215, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 193.03125, "completions/mean_terminated_length": 193.03125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.26168012619018555, "epoch": 2.4411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.7832935941676689, "kl": 0.11002017557621002, "learning_rate": 1.0359453023506121e-07, "loss": -0.0068, "num_tokens": 62884852.0, "reward": 0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5547469854354858, "sampling/importance_sampling_ratio/mean": 1.0002416372299194, "sampling/importance_sampling_ratio/min": 0.6142948269844055, "sampling/sampling_logp_difference/max": 0.48728036880493164, "sampling/sampling_logp_difference/mean": 0.014116182923316956, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 193.71875, "completions/mean_terminated_length": 193.71875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.26052984595298767, "epoch": 2.4424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8917314980948466, "kl": 0.08373277634382248, "learning_rate": 1.0316076059258389e-07, "loss": 0.0001, "num_tokens": 62914754.0, "reward": -0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5302009582519531, "sampling/importance_sampling_ratio/mean": 1.000388503074646, "sampling/importance_sampling_ratio/min": 0.5006714463233948, "sampling/sampling_logp_difference/max": 0.6918051242828369, "sampling/sampling_logp_difference/mean": 0.014232850633561611, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 174.609375, "completions/mean_terminated_length": 174.609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.1979416310787201, "epoch": 2.443627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 1.3102813169215466, "kl": 0.06639556586742401, "learning_rate": 1.0272779650862185e-07, "loss": -0.0236, "num_tokens": 62945577.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.6545871496200562, "sampling/importance_sampling_ratio/mean": 1.0002236366271973, "sampling/importance_sampling_ratio/min": 0.6114649772644043, "sampling/sampling_logp_difference/max": 0.5035514831542969, "sampling/sampling_logp_difference/mean": 0.012921427376568317, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 192.96875, "completions/mean_terminated_length": 192.96875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3212936520576477, "epoch": 2.4448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.2865328555504383, "kl": 0.1091475859284401, "learning_rate": 1.0229563886206516e-07, "loss": 0.0219, "num_tokens": 62977271.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6545579433441162, "sampling/importance_sampling_ratio/mean": 0.9995372295379639, "sampling/importance_sampling_ratio/min": 0.5046824216842651, "sampling/sampling_logp_difference/max": 0.6838259696960449, "sampling/sampling_logp_difference/mean": 0.01750960759818554, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 145.265625, "completions/mean_terminated_length": 145.265625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.20676608383655548, "epoch": 2.446078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.05716501569543919, "kl": 0.06579281389713287, "learning_rate": 1.0186428853016604e-07, "loss": 0.0007, "num_tokens": 63007944.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9099797010421753, "sampling/importance_sampling_ratio/mean": 1.000302791595459, "sampling/importance_sampling_ratio/min": 0.5734854936599731, "sampling/sampling_logp_difference/max": 0.6470925807952881, "sampling/sampling_logp_difference/mean": 0.013321581296622753, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 182.671875, "completions/mean_terminated_length": 182.671875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2063009887933731, "epoch": 2.4473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.05980800829816122, "kl": 0.09165454655885696, "learning_rate": 1.0143374638853891e-07, "loss": 0.0009, "num_tokens": 63035635.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.575442910194397, "sampling/importance_sampling_ratio/mean": 0.9995609521865845, "sampling/importance_sampling_ratio/min": 0.432391881942749, "sampling/sampling_logp_difference/max": 0.8384230136871338, "sampling/sampling_logp_difference/mean": 0.014535813592374325, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 212.34375, "completions/mean_terminated_length": 212.34375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2586808204650879, "epoch": 2.448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.027373811906634, "kl": 0.0688728392124176, "learning_rate": 1.0100401331115638e-07, "loss": 0.0055, "num_tokens": 63067177.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5745112895965576, "sampling/importance_sampling_ratio/mean": 0.9992469549179077, "sampling/importance_sampling_ratio/min": 0.6117782592773438, "sampling/sampling_logp_difference/max": 0.49138545989990234, "sampling/sampling_logp_difference/mean": 0.013521851971745491, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 212.3125, "completions/mean_terminated_length": 212.3125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.25559788942337036, "epoch": 2.4497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.04906564084842952, "kl": 0.0891442596912384, "learning_rate": 1.0057509017034977e-07, "loss": 0.0008, "num_tokens": 63098813.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6321675777435303, "sampling/importance_sampling_ratio/mean": 1.0008788108825684, "sampling/importance_sampling_ratio/min": 0.6126702427864075, "sampling/sampling_logp_difference/max": 0.4899284839630127, "sampling/sampling_logp_difference/mean": 0.015120787546038628, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 164.484375, "completions/mean_terminated_length": 164.484375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.20930957794189453, "epoch": 2.450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.06804692246110695, "kl": 0.07293498516082764, "learning_rate": 1.001469778368057e-07, "loss": 0.0007, "num_tokens": 63124828.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.696546196937561, "sampling/importance_sampling_ratio/mean": 0.9996410608291626, "sampling/importance_sampling_ratio/min": 0.43092525005340576, "sampling/sampling_logp_difference/max": 0.8418207168579102, "sampling/sampling_logp_difference/mean": 0.013784918002784252, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 155.09375, "completions/mean_terminated_length": 155.09375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27410781383514404, "epoch": 2.452205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 1.4272336769540415, "kl": 0.09052129089832306, "learning_rate": 9.971967717956531e-08, "loss": 0.0204, "num_tokens": 63162322.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6674655675888062, "sampling/importance_sampling_ratio/mean": 0.9999108910560608, "sampling/importance_sampling_ratio/min": 0.3962820768356323, "sampling/sampling_logp_difference/max": 0.9256290197372437, "sampling/sampling_logp_difference/mean": 0.01571957767009735, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 168.390625, "completions/mean_terminated_length": 168.390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2763534486293793, "epoch": 2.4534313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 1.7039991687458058, "kl": 0.10870914906263351, "learning_rate": 9.929318906602174e-08, "loss": 0.0185, "num_tokens": 63189323.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5515753030776978, "sampling/importance_sampling_ratio/mean": 0.9997731447219849, "sampling/importance_sampling_ratio/min": 0.44878554344177246, "sampling/sampling_logp_difference/max": 0.8012101054191589, "sampling/sampling_logp_difference/mean": 0.016712991520762444, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 234.671875, "completions/mean_terminated_length": 234.671875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.27719780802726746, "epoch": 2.454656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.877160944103908, "kl": 0.08174227178096771, "learning_rate": 9.886751436191871e-08, "loss": -0.0024, "num_tokens": 63224134.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6231054067611694, "sampling/importance_sampling_ratio/mean": 0.9999527931213379, "sampling/importance_sampling_ratio/min": 0.5397012829780579, "sampling/sampling_logp_difference/max": 0.6167394518852234, "sampling/sampling_logp_difference/mean": 0.014655927196145058, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 205.109375, "completions/mean_terminated_length": 205.109375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2259541153907776, "epoch": 2.4558823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.6899458495436732, "kl": 0.09486418217420578, "learning_rate": 9.844265393134926e-08, "loss": 0.0107, "num_tokens": 63256829.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002696514129639, "sampling/importance_sampling_ratio/min": 0.058544568717479706, "sampling/sampling_logp_difference/max": 2.8379669189453125, "sampling/sampling_logp_difference/mean": 0.015723761171102524, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.27056199312210083, "epoch": 2.457107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.08672783765579291, "kl": 0.13522277772426605, "learning_rate": 9.801860863675266e-08, "loss": 0.0014, "num_tokens": 63286797.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.831900954246521, "sampling/importance_sampling_ratio/mean": 1.0001976490020752, "sampling/importance_sampling_ratio/min": 0.3242254853248596, "sampling/sampling_logp_difference/max": 1.1263160705566406, "sampling/sampling_logp_difference/mean": 0.01663844659924507, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 212.90625, "completions/mean_terminated_length": 212.90625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.23600170016288757, "epoch": 2.4583333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.06562357598801245, "kl": 0.09271427989006042, "learning_rate": 9.759537933891421e-08, "loss": 0.0009, "num_tokens": 63316615.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7565416097640991, "sampling/importance_sampling_ratio/mean": 1.0001403093338013, "sampling/importance_sampling_ratio/min": 0.5815595984458923, "sampling/sampling_logp_difference/max": 0.5633468627929688, "sampling/sampling_logp_difference/mean": 0.014578972943127155, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 174.84375, "completions/mean_terminated_length": 174.84375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2847607433795929, "epoch": 2.4595588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 2.0359462268745783, "kl": 0.16529709100723267, "learning_rate": 9.71729668969628e-08, "loss": -0.026, "num_tokens": 63345773.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5527472496032715, "sampling/importance_sampling_ratio/mean": 0.9999092221260071, "sampling/importance_sampling_ratio/min": 0.5738064050674438, "sampling/sampling_logp_difference/max": 0.5554631948471069, "sampling/sampling_logp_difference/mean": 0.01704687811434269, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 162.265625, "completions/mean_terminated_length": 162.265625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2768649160861969, "epoch": 2.4607843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 2.2699946394772597, "kl": 0.112728051841259, "learning_rate": 9.67513721683687e-08, "loss": -0.0048, "num_tokens": 63371598.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6522746086120605, "sampling/importance_sampling_ratio/mean": 0.9994333386421204, "sampling/importance_sampling_ratio/min": 0.6203518509864807, "sampling/sampling_logp_difference/max": 0.5021529197692871, "sampling/sampling_logp_difference/mean": 0.015261069871485233, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 210.484375, "completions/mean_terminated_length": 210.484375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2198856770992279, "epoch": 2.4620098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.2203288813385462, "kl": 0.06088901311159134, "learning_rate": 9.633059600894256e-08, "loss": -0.077, "num_tokens": 63410125.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.602226972579956, "sampling/importance_sampling_ratio/mean": 0.9999580383300781, "sampling/importance_sampling_ratio/min": 0.485906183719635, "sampling/sampling_logp_difference/max": 0.7217397689819336, "sampling/sampling_logp_difference/mean": 0.013644246384501457, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 173.609375, "completions/mean_terminated_length": 173.609375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.24299579858779907, "epoch": 2.463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.5943768688521742, "kl": 0.06873179972171783, "learning_rate": 9.59106392728331e-08, "loss": -0.0071, "num_tokens": 63441172.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000420331954956, "sampling/importance_sampling_ratio/min": 0.6032364964485168, "sampling/sampling_logp_difference/max": 0.708549976348877, "sampling/sampling_logp_difference/mean": 0.014463325031101704, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 198.546875, "completions/mean_terminated_length": 198.546875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.23096302151679993, "epoch": 2.4644607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.3059021548108523, "kl": 0.07095976173877716, "learning_rate": 9.549150281252632e-08, "loss": 0.0013, "num_tokens": 63476039.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005818605422974, "sampling/importance_sampling_ratio/min": 0.2528098523616791, "sampling/sampling_logp_difference/max": 1.3751176595687866, "sampling/sampling_logp_difference/mean": 0.01635436713695526, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 207.671875, "completions/mean_terminated_length": 207.671875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.27460724115371704, "epoch": 2.465686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.4587355614766577, "kl": 0.08916577696800232, "learning_rate": 9.507318747884241e-08, "loss": -0.0033, "num_tokens": 63509426.0, "reward": 0.375, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000704526901245, "sampling/importance_sampling_ratio/min": 0.3304603397846222, "sampling/sampling_logp_difference/max": 1.1072685718536377, "sampling/sampling_logp_difference/mean": 0.013697458431124687, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 172.171875, "completions/mean_terminated_length": 172.171875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2268461287021637, "epoch": 2.4669117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.9820631840838865, "kl": 0.1107923835515976, "learning_rate": 9.465569412093488e-08, "loss": -0.0085, "num_tokens": 63536013.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001955032348633, "sampling/importance_sampling_ratio/min": 0.5419043898582458, "sampling/sampling_logp_difference/max": 1.5411171913146973, "sampling/sampling_logp_difference/mean": 0.013624858111143112, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 232.78125, "completions/mean_terminated_length": 232.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2935520112514496, "epoch": 2.468137254901961, "frac_reward_zero_std": 0.25, "grad_norm": 1.8649602721943874, "kl": 0.1160188540816307, "learning_rate": 9.423902358628916e-08, "loss": 0.0359, "num_tokens": 63575823.0, "reward": 0.5, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994773864746094, "sampling/importance_sampling_ratio/min": 0.5942912697792053, "sampling/sampling_logp_difference/max": 0.7240054607391357, "sampling/sampling_logp_difference/mean": 0.014655300416052341, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22237110137939453, "epoch": 2.469362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.149968194477422, "kl": 0.06632451713085175, "learning_rate": 9.382317672071966e-08, "loss": -0.0339, "num_tokens": 63603543.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6439599990844727, "sampling/importance_sampling_ratio/mean": 0.9999493360519409, "sampling/importance_sampling_ratio/min": 0.5227307677268982, "sampling/sampling_logp_difference/max": 0.648688793182373, "sampling/sampling_logp_difference/mean": 0.014232446439564228, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 225.3125, "completions/mean_terminated_length": 225.3125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.23953120410442352, "epoch": 2.4705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.1410351693893919, "kl": 0.09374180436134338, "learning_rate": 9.340815436836963e-08, "loss": 0.0009, "num_tokens": 63636507.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9712061882019043, "sampling/importance_sampling_ratio/mean": 1.0000889301300049, "sampling/importance_sampling_ratio/min": 0.5806989669799805, "sampling/sampling_logp_difference/max": 0.6786456108093262, "sampling/sampling_logp_difference/mean": 0.015163847245275974, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 251.96875, "completions/mean_terminated_length": 251.96875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.21348419785499573, "epoch": 2.471813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.991835265185055, "kl": 0.06538738310337067, "learning_rate": 9.299395737170757e-08, "loss": -0.034, "num_tokens": 63669849.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.8819410800933838, "sampling/importance_sampling_ratio/mean": 0.999263346195221, "sampling/importance_sampling_ratio/min": 0.08145135641098022, "sampling/sampling_logp_difference/max": 2.507749319076538, "sampling/sampling_logp_difference/mean": 0.012916372157633305, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 168.96875, "completions/mean_terminated_length": 168.96875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.19465279579162598, "epoch": 2.4730392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.05307041611124714, "kl": 0.08682794123888016, "learning_rate": 9.258058657152761e-08, "loss": 0.0008, "num_tokens": 63697975.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000096082687378, "sampling/importance_sampling_ratio/min": 0.6017568111419678, "sampling/sampling_logp_difference/max": 0.841266393661499, "sampling/sampling_logp_difference/mean": 0.012608527205884457, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.3037737011909485, "epoch": 2.474264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.0234115255557776, "kl": 0.18698279559612274, "learning_rate": 9.216804280694612e-08, "loss": -0.0039, "num_tokens": 63726383.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9514977931976318, "sampling/importance_sampling_ratio/mean": 0.999690055847168, "sampling/importance_sampling_ratio/min": 0.5022098422050476, "sampling/sampling_logp_difference/max": 0.6887372732162476, "sampling/sampling_logp_difference/mean": 0.015550685115158558, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 206.421875, "completions/mean_terminated_length": 206.421875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.23022770881652832, "epoch": 2.4754901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.2283473709930766, "kl": 0.07669369131326675, "learning_rate": 9.175632691540064e-08, "loss": 0.0925, "num_tokens": 63760714.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.64573335647583, "sampling/importance_sampling_ratio/mean": 0.9996827840805054, "sampling/importance_sampling_ratio/min": 0.5268608331680298, "sampling/sampling_logp_difference/max": 0.6408188343048096, "sampling/sampling_logp_difference/mean": 0.014048980548977852, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 133.34375, "completions/mean_terminated_length": 133.34375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.19566136598587036, "epoch": 2.4767156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 2.1620033596651016, "kl": 0.07578049600124359, "learning_rate": 9.134543973264868e-08, "loss": 0.0124, "num_tokens": 63780688.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5518295764923096, "sampling/importance_sampling_ratio/mean": 0.9996536374092102, "sampling/importance_sampling_ratio/min": 0.5661754608154297, "sampling/sampling_logp_difference/max": 0.5688512325286865, "sampling/sampling_logp_difference/mean": 0.013932591304183006, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 212.421875, "completions/mean_terminated_length": 212.421875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2625789940357208, "epoch": 2.4779411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.3469507053829686, "kl": 0.08934664726257324, "learning_rate": 9.093538209276486e-08, "loss": -0.0173, "num_tokens": 63810011.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.8731153011322021, "sampling/importance_sampling_ratio/mean": 0.9999089241027832, "sampling/importance_sampling_ratio/min": 0.5131204724311829, "sampling/sampling_logp_difference/max": 0.6672446727752686, "sampling/sampling_logp_difference/mean": 0.014829147607088089, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 217.03125, "completions/mean_terminated_length": 217.03125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2258068323135376, "epoch": 2.4791666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.04342160452499901, "kl": 0.08009222894906998, "learning_rate": 9.052615482814069e-08, "loss": 0.0007, "num_tokens": 63846445.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7562291622161865, "sampling/importance_sampling_ratio/mean": 0.9999803900718689, "sampling/importance_sampling_ratio/min": 0.48317864537239075, "sampling/sampling_logp_difference/max": 0.7273688316345215, "sampling/sampling_logp_difference/mean": 0.012380572967231274, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.29769203066825867, "epoch": 2.480392156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.532320422530228, "kl": 0.12566164135932922, "learning_rate": 9.011775876948096e-08, "loss": 0.0733, "num_tokens": 63876381.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999967217445374, "sampling/importance_sampling_ratio/min": 0.5362070202827454, "sampling/sampling_logp_difference/max": 0.710334300994873, "sampling/sampling_logp_difference/mean": 0.01466565765440464, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 180.265625, "completions/mean_terminated_length": 180.265625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.267097145318985, "epoch": 2.4816176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.920288812577845, "kl": 0.12502649426460266, "learning_rate": 8.971019474580427e-08, "loss": 0.0031, "num_tokens": 63901406.0, "reward": 0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5178194046020508, "sampling/importance_sampling_ratio/mean": 1.0002391338348389, "sampling/importance_sampling_ratio/min": 0.6014291048049927, "sampling/sampling_logp_difference/max": 0.5084466934204102, "sampling/sampling_logp_difference/mean": 0.015463524498045444, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 179.734375, "completions/mean_terminated_length": 179.734375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2664422392845154, "epoch": 2.482843137254902, "frac_reward_zero_std": 0.25, "grad_norm": 2.5975967423978235, "kl": 0.10327211022377014, "learning_rate": 8.930346358443953e-08, "loss": 0.0818, "num_tokens": 63927581.0, "reward": -0.03125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4618009328842163, "sampling/importance_sampling_ratio/mean": 0.9999598264694214, "sampling/importance_sampling_ratio/min": 0.4522865414619446, "sampling/sampling_logp_difference/max": 0.7934393882751465, "sampling/sampling_logp_difference/mean": 0.01469662319868803, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 163.984375, "completions/mean_terminated_length": 163.984375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.2569267749786377, "epoch": 2.4840686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.08712411280874989, "kl": 0.10711999982595444, "learning_rate": 8.889756611102539e-08, "loss": 0.0011, "num_tokens": 63952604.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.630460500717163, "sampling/importance_sampling_ratio/mean": 0.9999353289604187, "sampling/importance_sampling_ratio/min": 0.46057137846946716, "sampling/sampling_logp_difference/max": 0.775287389755249, "sampling/sampling_logp_difference/mean": 0.016630226746201515, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 202.3125, "completions/mean_terminated_length": 202.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2259894609451294, "epoch": 2.485294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.228854813862446, "kl": 0.05850193649530411, "learning_rate": 8.84925031495079e-08, "loss": -0.0875, "num_tokens": 63982752.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6272915601730347, "sampling/importance_sampling_ratio/mean": 1.0000848770141602, "sampling/importance_sampling_ratio/min": 0.47421547770500183, "sampling/sampling_logp_difference/max": 0.7460935115814209, "sampling/sampling_logp_difference/mean": 0.01230490393936634, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 174.609375, "completions/mean_terminated_length": 174.609375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2859165668487549, "epoch": 2.486519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 2.0031900250778567, "kl": 0.11168570816516876, "learning_rate": 8.808827552213916e-08, "loss": -0.0181, "num_tokens": 64008311.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999197125434875, "sampling/importance_sampling_ratio/min": 0.5540803074836731, "sampling/sampling_logp_difference/max": 0.6932724714279175, "sampling/sampling_logp_difference/mean": 0.01680031418800354, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 141.59375, "completions/mean_terminated_length": 141.59375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.246006578207016, "epoch": 2.4877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.05821484953334174, "kl": 0.08178937435150146, "learning_rate": 8.768488404947593e-08, "loss": 0.0008, "num_tokens": 64033757.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.640612244606018, "sampling/importance_sampling_ratio/mean": 0.9998109340667725, "sampling/importance_sampling_ratio/min": 0.5501823425292969, "sampling/sampling_logp_difference/max": 0.5975055694580078, "sampling/sampling_logp_difference/mean": 0.015690255910158157, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 196.015625, "completions/mean_terminated_length": 196.015625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.24520564079284668, "epoch": 2.488970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.042374916409275605, "kl": 0.06653556227684021, "learning_rate": 8.728232955037696e-08, "loss": 0.0006, "num_tokens": 64063710.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7712163925170898, "sampling/importance_sampling_ratio/mean": 1.0000073909759521, "sampling/importance_sampling_ratio/min": 0.5895556807518005, "sampling/sampling_logp_difference/max": 0.5716664791107178, "sampling/sampling_logp_difference/mean": 0.01448611356317997, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 188.828125, "completions/mean_terminated_length": 188.828125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.16599786281585693, "epoch": 2.4901960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.07786440309286723, "kl": 0.05656210705637932, "learning_rate": 8.688061284200265e-08, "loss": 0.0006, "num_tokens": 64094675.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007353067398071, "sampling/importance_sampling_ratio/mean": 1.0000226497650146, "sampling/importance_sampling_ratio/min": 0.319061279296875, "sampling/sampling_logp_difference/max": 1.1423721313476562, "sampling/sampling_logp_difference/mean": 0.010407873429358006, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 222.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2846994698047638, "epoch": 2.491421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.8236398880872422, "kl": 0.16061674058437347, "learning_rate": 8.647973473981224e-08, "loss": -0.0055, "num_tokens": 64128551.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 0.9995661973953247, "sampling/importance_sampling_ratio/min": 0.4824768006801605, "sampling/sampling_logp_difference/max": 0.7288224697113037, "sampling/sampling_logp_difference/mean": 0.015534237027168274, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 172.09375, "completions/mean_terminated_length": 172.09375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.21926692128181458, "epoch": 2.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.04258034793347578, "kl": 0.05456060916185379, "learning_rate": 8.607969605756315e-08, "loss": 0.0006, "num_tokens": 64157933.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999468207359314, "sampling/importance_sampling_ratio/min": 0.5684303045272827, "sampling/sampling_logp_difference/max": 0.986548900604248, "sampling/sampling_logp_difference/mean": 0.013844440691173077, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 225.171875, "completions/mean_terminated_length": 225.171875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2624910771846771, "epoch": 2.493872549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.7775404778616285, "kl": 0.07403460144996643, "learning_rate": 8.568049760730838e-08, "loss": 0.0044, "num_tokens": 64193496.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6019309759140015, "sampling/importance_sampling_ratio/mean": 1.0007879734039307, "sampling/importance_sampling_ratio/min": 0.552937924861908, "sampling/sampling_logp_difference/max": 0.5925095081329346, "sampling/sampling_logp_difference/mean": 0.014647210016846657, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 176.640625, "completions/mean_terminated_length": 176.640625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3022158443927765, "epoch": 2.4950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.992703879495102, "kl": 0.10479722917079926, "learning_rate": 8.52821401993955e-08, "loss": -0.0042, "num_tokens": 64223681.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010337829589844, "sampling/importance_sampling_ratio/min": 0.6119037866592407, "sampling/sampling_logp_difference/max": 0.8013436794281006, "sampling/sampling_logp_difference/mean": 0.016235051676630974, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.20266065001487732, "epoch": 2.4963235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.04784982307143272, "kl": 0.05678699165582657, "learning_rate": 8.488462464246493e-08, "loss": 0.0006, "num_tokens": 64253953.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.720233678817749, "sampling/importance_sampling_ratio/mean": 0.9993163347244263, "sampling/importance_sampling_ratio/min": 0.6155776381492615, "sampling/sampling_logp_difference/max": 0.5424602031707764, "sampling/sampling_logp_difference/mean": 0.01326032169163227, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 152.203125, "completions/mean_terminated_length": 152.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.18145807087421417, "epoch": 2.497549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.4403634850359908, "kl": 0.07998235523700714, "learning_rate": 8.448795174344803e-08, "loss": -0.0118, "num_tokens": 64281198.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.651445984840393, "sampling/importance_sampling_ratio/mean": 1.0006821155548096, "sampling/importance_sampling_ratio/min": 0.639241635799408, "sampling/sampling_logp_difference/max": 0.5016512870788574, "sampling/sampling_logp_difference/mean": 0.012963157147169113, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 150.546875, "completions/mean_terminated_length": 150.546875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.22763216495513916, "epoch": 2.498774509803922, "frac_reward_zero_std": 0.5, "grad_norm": 2.260811076751465, "kl": 0.08246447145938873, "learning_rate": 8.409212230756563e-08, "loss": 0.0014, "num_tokens": 64305345.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6254801750183105, "sampling/importance_sampling_ratio/mean": 1.0008478164672852, "sampling/importance_sampling_ratio/min": 0.5552654266357422, "sampling/sampling_logp_difference/max": 0.5883090496063232, "sampling/sampling_logp_difference/mean": 0.013004804030060768, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 208.71875, "completions/mean_terminated_length": 208.71875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2567097246646881, "epoch": 2.5, "frac_reward_zero_std": 0.5, "grad_norm": 1.8692939451667037, "kl": 0.09157007932662964, "learning_rate": 8.369713713832622e-08, "loss": -0.0186, "num_tokens": 64338159.0, "reward": -0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001546144485474, "sampling/importance_sampling_ratio/min": 0.4440856873989105, "sampling/sampling_logp_difference/max": 0.9236316680908203, "sampling/sampling_logp_difference/mean": 0.014840789139270782, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 154.296875, "completions/mean_terminated_length": 154.296875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.28439900279045105, "epoch": 2.501225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.6465117330095138, "kl": 0.12006236612796783, "learning_rate": 8.330299703752497e-08, "loss": 0.0087, "num_tokens": 64368914.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5714937448501587, "sampling/importance_sampling_ratio/mean": 0.999998152256012, "sampling/importance_sampling_ratio/min": 0.5982145667076111, "sampling/sampling_logp_difference/max": 0.5138057470321655, "sampling/sampling_logp_difference/mean": 0.01627548784017563, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 202.078125, "completions/mean_terminated_length": 202.078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2615170478820801, "epoch": 2.502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.383059434915081, "kl": 0.07800057530403137, "learning_rate": 8.290970280524124e-08, "loss": 0.0182, "num_tokens": 64396679.0, "reward": -0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.7811089754104614, "sampling/importance_sampling_ratio/mean": 0.9999487996101379, "sampling/importance_sampling_ratio/min": 0.37332433462142944, "sampling/sampling_logp_difference/max": 0.9853076934814453, "sampling/sampling_logp_difference/mean": 0.014414073899388313, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 145.546875, "completions/mean_terminated_length": 145.546875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.18265660107135773, "epoch": 2.5036764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.5947788116146797, "kl": 0.08648387342691422, "learning_rate": 8.251725523983722e-08, "loss": -0.0145, "num_tokens": 64421530.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.7496325969696045, "sampling/importance_sampling_ratio/mean": 1.0000611543655396, "sampling/importance_sampling_ratio/min": 0.25419026613235474, "sampling/sampling_logp_difference/max": 1.369672179222107, "sampling/sampling_logp_difference/mean": 0.012066777795553207, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2852139472961426, "epoch": 2.5049019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.1845940609833618, "kl": 0.1319739818572998, "learning_rate": 8.212565513795683e-08, "loss": 0.0052, "num_tokens": 64454354.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5439215898513794, "sampling/importance_sampling_ratio/mean": 1.0001826286315918, "sampling/importance_sampling_ratio/min": 0.6155157089233398, "sampling/sampling_logp_difference/max": 0.48529481887817383, "sampling/sampling_logp_difference/mean": 0.015069638378918171, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 200.6875, "completions/mean_terminated_length": 200.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.21678279340267181, "epoch": 2.506127450980392, "frac_reward_zero_std": 0.5, "grad_norm": 1.4893007785410823, "kl": 0.05048329383134842, "learning_rate": 8.173490329452343e-08, "loss": 0.042, "num_tokens": 64485134.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4760957956314087, "sampling/importance_sampling_ratio/mean": 0.9999352693557739, "sampling/importance_sampling_ratio/min": 0.610935628414154, "sampling/sampling_logp_difference/max": 0.4927637577056885, "sampling/sampling_logp_difference/mean": 0.012355451472103596, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 161.359375, "completions/mean_terminated_length": 161.359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27877625823020935, "epoch": 2.5073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.059946639291397104, "kl": 0.08982808142900467, "learning_rate": 8.13450005027384e-08, "loss": 0.0009, "num_tokens": 64512773.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.630943775177002, "sampling/importance_sampling_ratio/mean": 0.9997053146362305, "sampling/importance_sampling_ratio/min": 0.3100298345088959, "sampling/sampling_logp_difference/max": 1.1710867881774902, "sampling/sampling_logp_difference/mean": 0.01694381982088089, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 204.859375, "completions/mean_terminated_length": 204.859375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2658664584159851, "epoch": 2.508578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 2.201674207561085, "kl": 0.07240475714206696, "learning_rate": 8.09559475540797e-08, "loss": 0.0524, "num_tokens": 64544076.0, "reward": 0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995499849319458, "sampling/importance_sampling_ratio/min": 0.49081358313560486, "sampling/sampling_logp_difference/max": 0.7116909027099609, "sampling/sampling_logp_difference/mean": 0.015354299917817116, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 167.328125, "completions/mean_terminated_length": 167.328125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.17701229453086853, "epoch": 2.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.05025101002355718, "kl": 0.04492766410112381, "learning_rate": 8.056774523830029e-08, "loss": 0.0005, "num_tokens": 64568097.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5922887325286865, "sampling/importance_sampling_ratio/mean": 1.0001029968261719, "sampling/importance_sampling_ratio/min": 0.3485910892486572, "sampling/sampling_logp_difference/max": 1.0538556575775146, "sampling/sampling_logp_difference/mean": 0.011708034202456474, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 186.265625, "completions/mean_terminated_length": 186.265625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.24314233660697937, "epoch": 2.5110294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 1.5858602248940317, "kl": 0.08327071368694305, "learning_rate": 8.018039434342627e-08, "loss": -0.1419, "num_tokens": 64597474.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5546276569366455, "sampling/importance_sampling_ratio/mean": 1.0001580715179443, "sampling/importance_sampling_ratio/min": 0.5519617795944214, "sampling/sampling_logp_difference/max": 0.5942764282226562, "sampling/sampling_logp_difference/mean": 0.01384375523775816, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 170.984375, "completions/mean_terminated_length": 170.984375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2665937542915344, "epoch": 2.5122549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 2.005298918913521, "kl": 0.09226707369089127, "learning_rate": 7.979389565575522e-08, "loss": 0.0282, "num_tokens": 64630401.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002641677856445, "sampling/importance_sampling_ratio/min": 0.5641261339187622, "sampling/sampling_logp_difference/max": 0.8223857879638672, "sampling/sampling_logp_difference/mean": 0.016130205243825912, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 190.421875, "completions/mean_terminated_length": 190.421875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3023907542228699, "epoch": 2.513480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.3700430215733999, "kl": 0.14257675409317017, "learning_rate": 7.940824995985528e-08, "loss": -0.021, "num_tokens": 64658876.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6105904579162598, "sampling/importance_sampling_ratio/mean": 0.9993658065795898, "sampling/importance_sampling_ratio/min": 0.5193053483963013, "sampling/sampling_logp_difference/max": 0.6552631855010986, "sampling/sampling_logp_difference/mean": 0.015207895077764988, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 182.3125, "completions/mean_terminated_length": 182.3125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.22624555230140686, "epoch": 2.514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.04064950583779161, "kl": 0.06253501772880554, "learning_rate": 7.902345803856264e-08, "loss": 0.0006, "num_tokens": 64690304.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.844347596168518, "sampling/importance_sampling_ratio/mean": 1.0002517700195312, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.6121256351470947, "sampling/sampling_logp_difference/mean": 0.013546247966587543, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 194.859375, "completions/mean_terminated_length": 194.859375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.19373953342437744, "epoch": 2.5159313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.1898838684284017, "kl": 0.0509500578045845, "learning_rate": 7.863952067298041e-08, "loss": -0.0043, "num_tokens": 64721047.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004467964172363, "sampling/importance_sampling_ratio/min": 0.5552655458450317, "sampling/sampling_logp_difference/max": 0.9446532726287842, "sampling/sampling_logp_difference/mean": 0.012281032279133797, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 210.875, "completions/mean_terminated_length": 210.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3182130455970764, "epoch": 2.517156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.6178720617889286, "kl": 0.16294899582862854, "learning_rate": 7.825643864247733e-08, "loss": 0.0121, "num_tokens": 64757567.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6558222770690918, "sampling/importance_sampling_ratio/mean": 1.0004451274871826, "sampling/importance_sampling_ratio/min": 0.6287725567817688, "sampling/sampling_logp_difference/max": 0.5042977333068848, "sampling/sampling_logp_difference/mean": 0.01619984209537506, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 168.71875, "completions/mean_terminated_length": 168.71875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2119746059179306, "epoch": 2.5183823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.10989378193671345, "kl": 0.09138666093349457, "learning_rate": 7.787421272468547e-08, "loss": 0.0009, "num_tokens": 64788173.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003702640533447, "sampling/importance_sampling_ratio/min": 0.6110650897026062, "sampling/sampling_logp_difference/max": 1.09181809425354, "sampling/sampling_logp_difference/mean": 0.013934195972979069, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 181.890625, "completions/mean_terminated_length": 181.890625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3089261054992676, "epoch": 2.519607843137255, "frac_reward_zero_std": 0.25, "grad_norm": 2.452386378197396, "kl": 0.09943846613168716, "learning_rate": 7.749284369549952e-08, "loss": 0.0232, "num_tokens": 64815158.0, "reward": 0.3125, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.755895972251892, "sampling/importance_sampling_ratio/mean": 1.000169277191162, "sampling/importance_sampling_ratio/min": 0.34555506706237793, "sampling/sampling_logp_difference/max": 1.062603235244751, "sampling/sampling_logp_difference/mean": 0.015521117486059666, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 159.265625, "completions/mean_terminated_length": 159.265625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.29940080642700195, "epoch": 2.5208333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 2.508780815193096, "kl": 0.10483361780643463, "learning_rate": 7.711233232907399e-08, "loss": 0.0639, "num_tokens": 64843287.0, "reward": 0.625, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6515302658081055, "sampling/importance_sampling_ratio/mean": 0.9991793632507324, "sampling/importance_sampling_ratio/min": 0.6066753268241882, "sampling/sampling_logp_difference/max": 0.5017023086547852, "sampling/sampling_logp_difference/mean": 0.017587456852197647, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 168.3125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2717065215110779, "epoch": 2.5220588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.6533966641128728, "kl": 0.08779462426900864, "learning_rate": 7.673267939782324e-08, "loss": -0.0128, "num_tokens": 64875995.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.708302617073059, "sampling/importance_sampling_ratio/mean": 0.9995182156562805, "sampling/importance_sampling_ratio/min": 0.27822449803352356, "sampling/sampling_logp_difference/max": 1.2793269157409668, "sampling/sampling_logp_difference/mean": 0.015954112634062767, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 207.78125, "completions/mean_terminated_length": 207.78125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.20165926218032837, "epoch": 2.5232843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.03778193338893196, "kl": 0.06783895939588547, "learning_rate": 7.63538856724184e-08, "loss": 0.0007, "num_tokens": 64910653.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5551292896270752, "sampling/importance_sampling_ratio/mean": 1.0003447532653809, "sampling/importance_sampling_ratio/min": 0.5010187029838562, "sampling/sampling_logp_difference/max": 0.6911118030548096, "sampling/sampling_logp_difference/mean": 0.012874551117420197, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 149.984375, "completions/mean_terminated_length": 149.984375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.20012640953063965, "epoch": 2.5245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.1445528136134168, "kl": 0.07653352618217468, "learning_rate": 7.597595192178702e-08, "loss": 0.0008, "num_tokens": 64936588.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004346370697021, "sampling/importance_sampling_ratio/min": 0.25459831953048706, "sampling/sampling_logp_difference/max": 1.5598270893096924, "sampling/sampling_logp_difference/mean": 0.015426107682287693, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 181.28125, "completions/mean_terminated_length": 181.28125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.22334742546081543, "epoch": 2.525735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.6107262833998872, "kl": 0.07174746692180634, "learning_rate": 7.559887891311046e-08, "loss": 0.01, "num_tokens": 64963470.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6041007041931152, "sampling/importance_sampling_ratio/mean": 1.0001410245895386, "sampling/importance_sampling_ratio/min": 0.6211667060852051, "sampling/sampling_logp_difference/max": 0.47615575790405273, "sampling/sampling_logp_difference/mean": 0.013342998921871185, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 168.4375, "completions/mean_terminated_length": 168.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.29684188961982727, "epoch": 2.5269607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.8925817098884088, "kl": 0.09327543526887894, "learning_rate": 7.522266741182303e-08, "loss": 0.0316, "num_tokens": 64998858.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5071570873260498, "sampling/importance_sampling_ratio/mean": 0.9994641542434692, "sampling/importance_sampling_ratio/min": 0.6080344319343567, "sampling/sampling_logp_difference/max": 0.49752378463745117, "sampling/sampling_logp_difference/mean": 0.015332784503698349, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 202.796875, "completions/mean_terminated_length": 202.796875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.28393346071243286, "epoch": 2.528186274509804, "frac_reward_zero_std": 0.5, "grad_norm": 1.6044857309462037, "kl": 0.09946949034929276, "learning_rate": 7.484731818161049e-08, "loss": -0.0077, "num_tokens": 65026365.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6209652423858643, "sampling/importance_sampling_ratio/mean": 0.9997640252113342, "sampling/importance_sampling_ratio/min": 0.4355246424674988, "sampling/sampling_logp_difference/max": 0.8312039375305176, "sampling/sampling_logp_difference/mean": 0.014704588800668716, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 132.09375, "completions/mean_terminated_length": 132.09375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2502855062484741, "epoch": 2.5294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.12678763176978367, "kl": 0.1325320154428482, "learning_rate": 7.447283198440763e-08, "loss": 0.0013, "num_tokens": 65049427.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.51406991481781, "sampling/importance_sampling_ratio/mean": 1.0000284910202026, "sampling/importance_sampling_ratio/min": 0.6081269383430481, "sampling/sampling_logp_difference/max": 0.4973716735839844, "sampling/sampling_logp_difference/mean": 0.015502297319471836, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 174.203125, "completions/mean_terminated_length": 174.203125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1999911367893219, "epoch": 2.530637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.4215878182256434, "kl": 0.08172202110290527, "learning_rate": 7.409920958039794e-08, "loss": -0.0051, "num_tokens": 65085184.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.700395941734314, "sampling/importance_sampling_ratio/mean": 1.0005156993865967, "sampling/importance_sampling_ratio/min": 0.5967065691947937, "sampling/sampling_logp_difference/max": 0.5308611392974854, "sampling/sampling_logp_difference/mean": 0.011997406370937824, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2532813549041748, "epoch": 2.531862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.3322553538983197, "kl": 0.13121700286865234, "learning_rate": 7.372645172801112e-08, "loss": -0.005, "num_tokens": 65113180.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7093794345855713, "sampling/importance_sampling_ratio/mean": 1.0005707740783691, "sampling/importance_sampling_ratio/min": 0.5436164140701294, "sampling/sampling_logp_difference/max": 0.6095113754272461, "sampling/sampling_logp_difference/mean": 0.014983594417572021, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 200.578125, "completions/mean_terminated_length": 200.578125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.272316038608551, "epoch": 2.5330882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.677935571660197, "kl": 0.08236125856637955, "learning_rate": 7.335455918392219e-08, "loss": 0.0161, "num_tokens": 65146721.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8162810802459717, "sampling/importance_sampling_ratio/mean": 0.9998995065689087, "sampling/importance_sampling_ratio/min": 0.6058658361434937, "sampling/sampling_logp_difference/max": 0.5967910289764404, "sampling/sampling_logp_difference/mean": 0.01491851918399334, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2139989733695984, "epoch": 2.534313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.040576103423437294, "kl": 0.060933761298656464, "learning_rate": 7.29835327030493e-08, "loss": 0.0006, "num_tokens": 65173713.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7566498517990112, "sampling/importance_sampling_ratio/mean": 1.0005521774291992, "sampling/importance_sampling_ratio/min": 0.6256545186042786, "sampling/sampling_logp_difference/max": 0.5634084939956665, "sampling/sampling_logp_difference/mean": 0.013278011232614517, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 195.578125, "completions/mean_terminated_length": 195.578125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.27939218282699585, "epoch": 2.5355392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.8515158369657, "kl": 0.0869421735405922, "learning_rate": 7.261337303855258e-08, "loss": 0.0053, "num_tokens": 65205510.0, "reward": -0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5609079599380493, "sampling/importance_sampling_ratio/mean": 0.9996978044509888, "sampling/importance_sampling_ratio/min": 0.46683257818222046, "sampling/sampling_logp_difference/max": 0.761784553527832, "sampling/sampling_logp_difference/mean": 0.01585400104522705, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 172.8125, "completions/mean_terminated_length": 172.8125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.256533682346344, "epoch": 2.536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 2.0299182589679767, "kl": 0.1013125628232956, "learning_rate": 7.224408094183299e-08, "loss": -0.0153, "num_tokens": 65230906.0, "reward": 0.34375, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.683823585510254, "sampling/importance_sampling_ratio/mean": 0.9999561905860901, "sampling/importance_sampling_ratio/min": 0.3722969591617584, "sampling/sampling_logp_difference/max": 0.9880634546279907, "sampling/sampling_logp_difference/mean": 0.015095336362719536, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 179.953125, "completions/mean_terminated_length": 179.953125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.28137242794036865, "epoch": 2.5379901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.9014808199425561, "kl": 0.11942119896411896, "learning_rate": 7.187565716252991e-08, "loss": 0.0309, "num_tokens": 65257591.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5801266431808472, "sampling/importance_sampling_ratio/mean": 1.0004737377166748, "sampling/importance_sampling_ratio/min": 0.41838201880455017, "sampling/sampling_logp_difference/max": 0.8713603019714355, "sampling/sampling_logp_difference/mean": 0.014653488993644714, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 172.0625, "completions/mean_terminated_length": 172.0625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.20256870985031128, "epoch": 2.5392156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.0938691809454313, "kl": 0.07591012120246887, "learning_rate": 7.150810244852035e-08, "loss": 0.0008, "num_tokens": 65284891.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999947547912598, "sampling/importance_sampling_ratio/min": 0.4757631719112396, "sampling/sampling_logp_difference/max": 0.9235873222351074, "sampling/sampling_logp_difference/mean": 0.013684678822755814, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 166.90625, "completions/mean_terminated_length": 166.90625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26050078868865967, "epoch": 2.5404411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.568693745447588, "kl": 0.09759359061717987, "learning_rate": 7.114141754591691e-08, "loss": -0.0497, "num_tokens": 65314533.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8480740785598755, "sampling/importance_sampling_ratio/mean": 0.9992367029190063, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.6141440868377686, "sampling/sampling_logp_difference/mean": 0.015216910280287266, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 142.984375, "completions/mean_terminated_length": 142.984375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.18746830523014069, "epoch": 2.5416666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.11363255287841326, "kl": 0.06820593774318695, "learning_rate": 7.077560319906694e-08, "loss": 0.0007, "num_tokens": 65340836.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5502012968063354, "sampling/importance_sampling_ratio/mean": 0.9999443292617798, "sampling/importance_sampling_ratio/min": 0.512789785861969, "sampling/sampling_logp_difference/max": 0.6678893566131592, "sampling/sampling_logp_difference/mean": 0.01332128793001175, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 221.828125, "completions/mean_terminated_length": 221.828125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.20960500836372375, "epoch": 2.542892156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.2641235618361277, "kl": 0.06737308204174042, "learning_rate": 7.041066015055036e-08, "loss": -0.0235, "num_tokens": 65374761.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5975921154022217, "sampling/importance_sampling_ratio/mean": 0.9998186826705933, "sampling/importance_sampling_ratio/min": 0.48500892519950867, "sampling/sampling_logp_difference/max": 0.7235879898071289, "sampling/sampling_logp_difference/mean": 0.012469634413719177, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 171.015625, "completions/mean_terminated_length": 171.015625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.20870862901210785, "epoch": 2.5441176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.9795602567650594, "kl": 0.09434500336647034, "learning_rate": 7.004658914117822e-08, "loss": 0.0603, "num_tokens": 65402202.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9099678993225098, "sampling/importance_sampling_ratio/mean": 0.9997029900550842, "sampling/importance_sampling_ratio/min": 0.5328991413116455, "sampling/sampling_logp_difference/max": 0.6470863819122314, "sampling/sampling_logp_difference/mean": 0.013507528230547905, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 149.59375, "completions/mean_terminated_length": 149.59375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22802692651748657, "epoch": 2.545343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.4513327872754587, "kl": 0.10282780975103378, "learning_rate": 6.968339090999186e-08, "loss": -0.0054, "num_tokens": 65430640.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7198556661605835, "sampling/importance_sampling_ratio/mean": 0.9994577169418335, "sampling/importance_sampling_ratio/min": 0.618988037109375, "sampling/sampling_logp_difference/max": 0.5422403812408447, "sampling/sampling_logp_difference/mean": 0.013412565924227238, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 183.5625, "completions/mean_terminated_length": 183.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.32695138454437256, "epoch": 2.5465686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.443674818174446, "kl": 0.13204100728034973, "learning_rate": 6.932106619426064e-08, "loss": -0.0153, "num_tokens": 65462372.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.424768090248108, "sampling/importance_sampling_ratio/mean": 1.000126600265503, "sampling/importance_sampling_ratio/min": 0.44382381439208984, "sampling/sampling_logp_difference/max": 0.8123276233673096, "sampling/sampling_logp_difference/mean": 0.016524365171790123, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 166.796875, "completions/mean_terminated_length": 166.796875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21823406219482422, "epoch": 2.547794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.5937634157364964, "kl": 0.08576545864343643, "learning_rate": 6.895961572948067e-08, "loss": -0.0062, "num_tokens": 65490215.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.7165240049362183, "sampling/importance_sampling_ratio/mean": 0.999908447265625, "sampling/importance_sampling_ratio/min": 0.5912283062934875, "sampling/sampling_logp_difference/max": 0.5403013229370117, "sampling/sampling_logp_difference/mean": 0.014561184681952, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 188.203125, "completions/mean_terminated_length": 188.203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2530749440193176, "epoch": 2.549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.4109219341926849, "kl": 0.07359974086284637, "learning_rate": 6.859904024937347e-08, "loss": -0.0148, "num_tokens": 65520100.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9106332063674927, "sampling/importance_sampling_ratio/mean": 1.0003867149353027, "sampling/importance_sampling_ratio/min": 0.45605409145355225, "sampling/sampling_logp_difference/max": 0.7851438522338867, "sampling/sampling_logp_difference/mean": 0.013720223680138588, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 236.515625, "completions/mean_terminated_length": 236.515625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.33111363649368286, "epoch": 2.5502450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.3630038149590784, "kl": 0.11825443804264069, "learning_rate": 6.823934048588459e-08, "loss": -0.0437, "num_tokens": 65551669.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5642359256744385, "sampling/importance_sampling_ratio/mean": 0.999976634979248, "sampling/importance_sampling_ratio/min": 0.5727118253707886, "sampling/sampling_logp_difference/max": 0.5573725700378418, "sampling/sampling_logp_difference/mean": 0.016054105013608932, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 183.203125, "completions/mean_terminated_length": 183.203125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3467482924461365, "epoch": 2.5514705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 2.022072626667965, "kl": 0.1452556550502777, "learning_rate": 6.78805171691817e-08, "loss": 0.0233, "num_tokens": 65584066.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996230602264404, "sampling/importance_sampling_ratio/min": 0.5155945420265198, "sampling/sampling_logp_difference/max": 0.7049882411956787, "sampling/sampling_logp_difference/mean": 0.017521627247333527, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 210.546875, "completions/mean_terminated_length": 210.546875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3249031603336334, "epoch": 2.5526960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.8277381996354904, "kl": 0.14773723483085632, "learning_rate": 6.752257102765324e-08, "loss": 0.0124, "num_tokens": 65623157.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.9977896213531494, "sampling/importance_sampling_ratio/mean": 1.0001258850097656, "sampling/importance_sampling_ratio/min": 0.5109723210334778, "sampling/sampling_logp_difference/max": 0.6920413970947266, "sampling/sampling_logp_difference/mean": 0.017270749434828758, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 266.40625, "completions/mean_terminated_length": 266.40625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2513664662837982, "epoch": 2.553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.629798547570488, "kl": 0.08482865989208221, "learning_rate": 6.716550278790739e-08, "loss": 0.0332, "num_tokens": 65661583.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.9655275344848633, "sampling/importance_sampling_ratio/mean": 0.9994052052497864, "sampling/importance_sampling_ratio/min": 0.4171328544616699, "sampling/sampling_logp_difference/max": 0.8743505477905273, "sampling/sampling_logp_difference/mean": 0.01324361003935337, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2828473448753357, "epoch": 2.5551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.5135601968951389, "kl": 0.12092584371566772, "learning_rate": 6.680931317476996e-08, "loss": 0.0367, "num_tokens": 65686951.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 1.0004072189331055, "sampling/importance_sampling_ratio/min": 0.5685153007507324, "sampling/sampling_logp_difference/max": 0.5647270679473877, "sampling/sampling_logp_difference/mean": 0.01583164557814598, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 163.546875, "completions/mean_terminated_length": 163.546875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3796335458755493, "epoch": 2.556372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 2.1911367549192553, "kl": 0.15162840485572815, "learning_rate": 6.645400291128356e-08, "loss": 0.0103, "num_tokens": 65722042.0, "reward": 0.4375, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5306832790374756, "sampling/importance_sampling_ratio/mean": 1.0003660917282104, "sampling/importance_sampling_ratio/min": 0.6159006953239441, "sampling/sampling_logp_difference/max": 0.4846695065498352, "sampling/sampling_logp_difference/mean": 0.019542595371603966, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 234.78125, "completions/mean_terminated_length": 234.78125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3299919366836548, "epoch": 2.5575980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.5794549794020731, "kl": 0.086224764585495, "learning_rate": 6.609957271870503e-08, "loss": -0.0126, "num_tokens": 65757164.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6277685165405273, "sampling/importance_sampling_ratio/mean": 0.9999915957450867, "sampling/importance_sampling_ratio/min": 0.3673675060272217, "sampling/sampling_logp_difference/max": 1.0013926029205322, "sampling/sampling_logp_difference/mean": 0.015792466700077057, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 194.546875, "completions/mean_terminated_length": 194.546875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.19744019210338593, "epoch": 2.5588235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.028118563136091786, "kl": 0.053611598908901215, "learning_rate": 6.574602331650559e-08, "loss": 0.0005, "num_tokens": 65785839.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7589657306671143, "sampling/importance_sampling_ratio/mean": 1.0012531280517578, "sampling/importance_sampling_ratio/min": 0.5676552653312683, "sampling/sampling_logp_difference/max": 0.5662409067153931, "sampling/sampling_logp_difference/mean": 0.01284267008304596, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 163.84375, "completions/mean_terminated_length": 163.84375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.25681692361831665, "epoch": 2.560049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.283364651688548, "kl": 0.10075172781944275, "learning_rate": 6.539335542236802e-08, "loss": 0.0091, "num_tokens": 65815237.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996932148933411, "sampling/importance_sampling_ratio/min": 0.5661249160766602, "sampling/sampling_logp_difference/max": 0.8119585514068604, "sampling/sampling_logp_difference/mean": 0.015214763581752777, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 191.46875, "completions/mean_terminated_length": 191.46875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.16612298786640167, "epoch": 2.561274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.055992535310503404, "kl": 0.05680924654006958, "learning_rate": 6.504156975218567e-08, "loss": 0.0005, "num_tokens": 65841731.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5558738708496094, "sampling/importance_sampling_ratio/mean": 0.9993709921836853, "sampling/importance_sampling_ratio/min": 0.3743380606174469, "sampling/sampling_logp_difference/max": 0.9825959205627441, "sampling/sampling_logp_difference/mean": 0.011070625856518745, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.28278350830078125, "epoch": 2.5625, "frac_reward_zero_std": 0.5, "grad_norm": 1.7593979467452114, "kl": 0.07036584615707397, "learning_rate": 6.469066702006137e-08, "loss": -0.1094, "num_tokens": 65873643.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7612903118133545, "sampling/importance_sampling_ratio/mean": 0.9999133944511414, "sampling/importance_sampling_ratio/min": 0.2949601709842682, "sampling/sampling_logp_difference/max": 1.2209149599075317, "sampling/sampling_logp_difference/mean": 0.01508941687643528, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 196.578125, "completions/mean_terminated_length": 196.578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3417613208293915, "epoch": 2.563725490196078, "frac_reward_zero_std": 0.5, "grad_norm": 1.828518664209619, "kl": 0.10849667340517044, "learning_rate": 6.43406479383053e-08, "loss": 0.0076, "num_tokens": 65901568.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.965592861175537, "sampling/importance_sampling_ratio/mean": 1.00014328956604, "sampling/importance_sampling_ratio/min": 0.6171426773071289, "sampling/sampling_logp_difference/max": 0.6757938861846924, "sampling/sampling_logp_difference/mean": 0.01746380887925625, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 141.234375, "completions/mean_terminated_length": 141.234375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.19732163846492767, "epoch": 2.564950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.7564960077024474, "kl": 0.08417364954948425, "learning_rate": 6.399151321743423e-08, "loss": 0.0272, "num_tokens": 65921871.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6207635402679443, "sampling/importance_sampling_ratio/mean": 0.9992195963859558, "sampling/importance_sampling_ratio/min": 0.591346025466919, "sampling/sampling_logp_difference/max": 0.5253539085388184, "sampling/sampling_logp_difference/mean": 0.014043517410755157, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 179.21875, "completions/mean_terminated_length": 179.21875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.242929607629776, "epoch": 2.5661764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.3703714253707795, "kl": 0.07308401167392731, "learning_rate": 6.364326356616917e-08, "loss": -0.0153, "num_tokens": 65958541.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8151217699050903, "sampling/importance_sampling_ratio/mean": 0.9997493028640747, "sampling/importance_sampling_ratio/min": 0.45788392424583435, "sampling/sampling_logp_difference/max": 0.781139612197876, "sampling/sampling_logp_difference/mean": 0.01518308650702238, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 171.65625, "completions/mean_terminated_length": 171.65625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.20393404364585876, "epoch": 2.5674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.044885106882471194, "kl": 0.076746866106987, "learning_rate": 6.329589969143517e-08, "loss": 0.0007, "num_tokens": 65987543.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9099587202072144, "sampling/importance_sampling_ratio/mean": 1.0000215768814087, "sampling/importance_sampling_ratio/min": 0.5633079409599304, "sampling/sampling_logp_difference/max": 0.6470816135406494, "sampling/sampling_logp_difference/mean": 0.012605559080839157, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 202.546875, "completions/mean_terminated_length": 202.546875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27725207805633545, "epoch": 2.568627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 1.1611957430125468, "kl": 0.08803446590900421, "learning_rate": 6.29494222983587e-08, "loss": 0.038, "num_tokens": 66025834.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999281167984009, "sampling/importance_sampling_ratio/min": 0.3211215138435364, "sampling/sampling_logp_difference/max": 1.135935664176941, "sampling/sampling_logp_difference/mean": 0.015521700493991375, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 203.46875, "completions/mean_terminated_length": 203.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.28914710879325867, "epoch": 2.5698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.2299099073108812, "kl": 0.09882599115371704, "learning_rate": 6.260383209026704e-08, "loss": 0.0148, "num_tokens": 66059304.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9051291942596436, "sampling/importance_sampling_ratio/mean": 1.0000487565994263, "sampling/importance_sampling_ratio/min": 0.5375722050666809, "sampling/sampling_logp_difference/max": 0.6445498466491699, "sampling/sampling_logp_difference/mean": 0.015876304358243942, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 203.546875, "completions/mean_terminated_length": 203.546875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3735591471195221, "epoch": 2.571078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.7435414209196602, "kl": 0.1381000429391861, "learning_rate": 6.225912976868636e-08, "loss": 0.0536, "num_tokens": 66092219.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005311965942383, "sampling/importance_sampling_ratio/min": 0.29176315665245056, "sampling/sampling_logp_difference/max": 1.2318129539489746, "sampling/sampling_logp_difference/mean": 0.020947258919477463, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 208.609375, "completions/mean_terminated_length": 208.609375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2167380303144455, "epoch": 2.5723039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.4442468872495544, "kl": 0.11169961839914322, "learning_rate": 6.191531603334044e-08, "loss": -0.0223, "num_tokens": 66120562.0, "reward": 0.4375, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6635867357254028, "sampling/importance_sampling_ratio/mean": 1.0005972385406494, "sampling/importance_sampling_ratio/min": 0.6209225654602051, "sampling/sampling_logp_difference/max": 0.5089759826660156, "sampling/sampling_logp_difference/mean": 0.012378549203276634, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 239.234375, "completions/mean_terminated_length": 239.234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.20222190022468567, "epoch": 2.5735294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 1.239160434612888, "kl": 0.04628078266978264, "learning_rate": 6.157239158214966e-08, "loss": -0.011, "num_tokens": 66158977.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000662803649902, "sampling/importance_sampling_ratio/min": 0.4699024558067322, "sampling/sampling_logp_difference/max": 0.8504469394683838, "sampling/sampling_logp_difference/mean": 0.012522635981440544, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 190.765625, "completions/mean_terminated_length": 190.765625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2047121524810791, "epoch": 2.5747549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.08701466942171444, "kl": 0.08167508244514465, "learning_rate": 6.123035711122859e-08, "loss": 0.0009, "num_tokens": 66189074.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.649046778678894, "sampling/importance_sampling_ratio/mean": 0.9988332986831665, "sampling/importance_sampling_ratio/min": 0.4462607800960541, "sampling/sampling_logp_difference/max": 0.806851863861084, "sampling/sampling_logp_difference/mean": 0.01280701719224453, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 209.828125, "completions/mean_terminated_length": 209.828125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2900751829147339, "epoch": 2.575980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.8466726522520487, "kl": 0.12072841823101044, "learning_rate": 6.088921331488566e-08, "loss": 0.048, "num_tokens": 66219319.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7802555561065674, "sampling/importance_sampling_ratio/mean": 0.9999192953109741, "sampling/importance_sampling_ratio/min": 0.4764772057533264, "sampling/sampling_logp_difference/max": 0.741335391998291, "sampling/sampling_logp_difference/mean": 0.015741858631372452, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 174.03125, "completions/mean_terminated_length": 174.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1932278871536255, "epoch": 2.577205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.8835831160264318, "kl": 0.07695497572422028, "learning_rate": 6.05489608856214e-08, "loss": 0.0817, "num_tokens": 66247353.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.848081111907959, "sampling/importance_sampling_ratio/mean": 0.9999405145645142, "sampling/importance_sampling_ratio/min": 0.48282214999198914, "sampling/sampling_logp_difference/max": 0.7281069159507751, "sampling/sampling_logp_difference/mean": 0.012569449841976166, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 188.6875, "completions/mean_terminated_length": 188.6875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.23454561829566956, "epoch": 2.5784313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.08303032723481436, "kl": 0.09593294560909271, "learning_rate": 6.020960051412638e-08, "loss": 0.0009, "num_tokens": 66276421.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000700950622559, "sampling/importance_sampling_ratio/min": 0.5919349193572998, "sampling/sampling_logp_difference/max": 0.7266454696655273, "sampling/sampling_logp_difference/mean": 0.01347368024289608, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 194.78125, "completions/mean_terminated_length": 194.78125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2750077247619629, "epoch": 2.579656862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.9504997349472788, "kl": 0.10330727696418762, "learning_rate": 5.98711328892808e-08, "loss": 0.0622, "num_tokens": 66307863.0, "reward": 0.65625, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.458220362663269, "sampling/importance_sampling_ratio/mean": 0.9996346235275269, "sampling/importance_sampling_ratio/min": 0.416096568107605, "sampling/sampling_logp_difference/max": 0.8768379092216492, "sampling/sampling_logp_difference/mean": 0.015241402201354504, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 213.375, "completions/mean_terminated_length": 213.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.30214518308639526, "epoch": 2.5808823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.806112724407539, "kl": 0.1500733196735382, "learning_rate": 5.9533558698152355e-08, "loss": -0.0382, "num_tokens": 66340031.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5259431600570679, "sampling/importance_sampling_ratio/mean": 0.9996534585952759, "sampling/importance_sampling_ratio/min": 0.5610695481300354, "sampling/sampling_logp_difference/max": 0.5779104232788086, "sampling/sampling_logp_difference/mean": 0.015558110550045967, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 259.140625, "completions/mean_terminated_length": 259.140625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.31714487075805664, "epoch": 2.582107843137255, "frac_reward_zero_std": 0.0, "grad_norm": 1.9515772424116236, "kl": 0.1299181431531906, "learning_rate": 5.919687862599548e-08, "loss": 0.0353, "num_tokens": 66375448.0, "reward": 0.71875, "reward_std": 0.6751632690429688, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5271198749542236, "sampling/importance_sampling_ratio/mean": 0.9996883869171143, "sampling/importance_sampling_ratio/min": 0.5868014693260193, "sampling/sampling_logp_difference/max": 0.5330686569213867, "sampling/sampling_logp_difference/mean": 0.015624091029167175, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 173.65625, "completions/mean_terminated_length": 173.65625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2470850646495819, "epoch": 2.5833333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.045180226644925776, "kl": 0.08131169527769089, "learning_rate": 5.886109335624928e-08, "loss": 0.0008, "num_tokens": 66406498.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6108070611953735, "sampling/importance_sampling_ratio/mean": 0.9997807741165161, "sampling/importance_sampling_ratio/min": 0.6077000498771667, "sampling/sampling_logp_difference/max": 0.4980738162994385, "sampling/sampling_logp_difference/mean": 0.01548383105546236, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 135.265625, "completions/mean_terminated_length": 135.265625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.17381243407726288, "epoch": 2.5845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.07604536003009166, "kl": 0.06403857469558716, "learning_rate": 5.8526203570536504e-08, "loss": 0.0006, "num_tokens": 66427587.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.574164867401123, "sampling/importance_sampling_ratio/mean": 1.0000239610671997, "sampling/importance_sampling_ratio/min": 0.5934368968009949, "sampling/sampling_logp_difference/max": 0.5218243598937988, "sampling/sampling_logp_difference/mean": 0.012833647429943085, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 135.96875, "completions/mean_terminated_length": 135.96875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.22191357612609863, "epoch": 2.5857843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.059451595488596576, "kl": 0.0778200775384903, "learning_rate": 5.819220994866236e-08, "loss": 0.0008, "num_tokens": 66451377.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5200186967849731, "sampling/importance_sampling_ratio/mean": 0.9999660849571228, "sampling/importance_sampling_ratio/min": 0.6176431775093079, "sampling/sampling_logp_difference/max": 0.481844425201416, "sampling/sampling_logp_difference/mean": 0.013643322512507439, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.25612854957580566, "epoch": 2.5870098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.470913918959459, "kl": 0.08519189059734344, "learning_rate": 5.7859113168612696e-08, "loss": 0.0171, "num_tokens": 66480445.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.838975429534912, "sampling/importance_sampling_ratio/mean": 0.9992935657501221, "sampling/importance_sampling_ratio/min": 0.5243265628814697, "sampling/sampling_logp_difference/max": 0.6456406116485596, "sampling/sampling_logp_difference/mean": 0.014908654615283012, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 221.515625, "completions/mean_terminated_length": 221.515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.24222581088542938, "epoch": 2.588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04218740059821595, "kl": 0.054951492697000504, "learning_rate": 5.7526913906552786e-08, "loss": 0.0006, "num_tokens": 66520142.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999772846698761, "sampling/importance_sampling_ratio/min": 0.4225720167160034, "sampling/sampling_logp_difference/max": 0.8613954186439514, "sampling/sampling_logp_difference/mean": 0.015231235884130001, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 248.265625, "completions/mean_terminated_length": 248.265625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2756633162498474, "epoch": 2.5894607843137254, "frac_reward_zero_std": 0.25, "grad_norm": 1.8832164839881007, "kl": 0.08400071412324905, "learning_rate": 5.7195612836826055e-08, "loss": 0.0005, "num_tokens": 66553407.0, "reward": 0.125, "reward_std": 0.6311737298965454, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6103041172027588, "sampling/importance_sampling_ratio/mean": 1.0004600286483765, "sampling/importance_sampling_ratio/min": 0.6165305376052856, "sampling/sampling_logp_difference/max": 0.48364734649658203, "sampling/sampling_logp_difference/mean": 0.013941222801804543, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 184.453125, "completions/mean_terminated_length": 184.453125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3001672625541687, "epoch": 2.590686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 2.1600929794595727, "kl": 0.09254103899002075, "learning_rate": 5.686521063195287e-08, "loss": -0.0046, "num_tokens": 66583676.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.633475422859192, "sampling/importance_sampling_ratio/mean": 0.9997601509094238, "sampling/importance_sampling_ratio/min": 0.5941472053527832, "sampling/sampling_logp_difference/max": 0.5206282138824463, "sampling/sampling_logp_difference/mean": 0.01590839773416519, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 290.515625, "completions/mean_terminated_length": 290.515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.19757306575775146, "epoch": 2.5919117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.8962636414866241, "kl": 0.07685651630163193, "learning_rate": 5.6535707962628685e-08, "loss": 0.0426, "num_tokens": 66625757.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6202460527420044, "sampling/importance_sampling_ratio/mean": 1.0001763105392456, "sampling/importance_sampling_ratio/min": 0.1880382001399994, "sampling/sampling_logp_difference/max": 1.6711101531982422, "sampling/sampling_logp_difference/mean": 0.012390850111842155, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 159.40625, "completions/mean_terminated_length": 159.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.269588440656662, "epoch": 2.593137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.0103775455000308, "kl": 0.10725459456443787, "learning_rate": 5.620710549772295e-08, "loss": 0.0092, "num_tokens": 66658983.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6176186800003052, "sampling/importance_sampling_ratio/mean": 0.9991856813430786, "sampling/importance_sampling_ratio/min": 0.1900569498538971, "sampling/sampling_logp_difference/max": 1.6604315042495728, "sampling/sampling_logp_difference/mean": 0.016164135187864304, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2638397812843323, "epoch": 2.594362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.1614219117302582, "kl": 0.10906018316745758, "learning_rate": 5.5879403904278034e-08, "loss": -0.0004, "num_tokens": 66683711.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.656031608581543, "sampling/importance_sampling_ratio/mean": 1.000084638595581, "sampling/importance_sampling_ratio/min": 0.5363211631774902, "sampling/sampling_logp_difference/max": 0.6230220794677734, "sampling/sampling_logp_difference/mean": 0.015072275884449482, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 171.96875, "completions/mean_terminated_length": 171.96875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2775786221027374, "epoch": 2.5955882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.6394531574709035, "kl": 0.13202057778835297, "learning_rate": 5.555260384750721e-08, "loss": -0.0068, "num_tokens": 66710525.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8529046773910522, "sampling/importance_sampling_ratio/mean": 0.9998239278793335, "sampling/importance_sampling_ratio/min": 0.5499146580696106, "sampling/sampling_logp_difference/max": 0.6167545318603516, "sampling/sampling_logp_difference/mean": 0.01635042577981949, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 204.78125, "completions/mean_terminated_length": 204.78125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.22385014593601227, "epoch": 2.596813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.06418912554396708, "kl": 0.07331070303916931, "learning_rate": 5.5226705990794156e-08, "loss": 0.0008, "num_tokens": 66747983.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6319586038589478, "sampling/importance_sampling_ratio/mean": 1.0003552436828613, "sampling/importance_sampling_ratio/min": 0.5362968444824219, "sampling/sampling_logp_difference/max": 0.6230674982070923, "sampling/sampling_logp_difference/mean": 0.014345422387123108, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 221.546875, "completions/mean_terminated_length": 221.546875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2747361958026886, "epoch": 2.5980392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.686704366115504, "kl": 0.08461602032184601, "learning_rate": 5.4901710995690576e-08, "loss": -0.0303, "num_tokens": 66780162.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.770574927330017, "sampling/importance_sampling_ratio/mean": 0.9999843835830688, "sampling/importance_sampling_ratio/min": 0.623648464679718, "sampling/sampling_logp_difference/max": 0.5713043212890625, "sampling/sampling_logp_difference/mean": 0.015064763836562634, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 197.3125, "completions/mean_terminated_length": 197.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.26871490478515625, "epoch": 2.599264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0842918697112692, "kl": 0.09573175013065338, "learning_rate": 5.4577619521915916e-08, "loss": 0.0009, "num_tokens": 66811302.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5764344930648804, "sampling/importance_sampling_ratio/mean": 1.0004682540893555, "sampling/importance_sampling_ratio/min": 0.5224652290344238, "sampling/sampling_logp_difference/max": 0.6491968631744385, "sampling/sampling_logp_difference/mean": 0.01558225229382515, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 200.390625, "completions/mean_terminated_length": 200.390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.25763362646102905, "epoch": 2.6004901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.9315214618315621, "kl": 0.10824404656887054, "learning_rate": 5.425443222735526e-08, "loss": -0.02, "num_tokens": 66839839.0, "reward": 0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5747989416122437, "sampling/importance_sampling_ratio/mean": 1.0000081062316895, "sampling/importance_sampling_ratio/min": 0.5106753706932068, "sampling/sampling_logp_difference/max": 0.6720211505889893, "sampling/sampling_logp_difference/mean": 0.014287374913692474, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 205.09375, "completions/mean_terminated_length": 205.09375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3279130756855011, "epoch": 2.6017156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 1.9649989694015835, "kl": 0.0774766206741333, "learning_rate": 5.393214976805832e-08, "loss": -0.0067, "num_tokens": 66875301.0, "reward": 0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5310070514678955, "sampling/importance_sampling_ratio/mean": 0.9998342990875244, "sampling/importance_sampling_ratio/min": 0.6203033924102783, "sampling/sampling_logp_difference/max": 0.47754669189453125, "sampling/sampling_logp_difference/mean": 0.018412724137306213, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 176.765625, "completions/mean_terminated_length": 176.765625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28056013584136963, "epoch": 2.6029411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.0256661287488673, "kl": 0.08669671416282654, "learning_rate": 5.361077279823817e-08, "loss": -0.0208, "num_tokens": 66903190.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5671733617782593, "sampling/importance_sampling_ratio/mean": 0.9998500347137451, "sampling/importance_sampling_ratio/min": 0.5264557003974915, "sampling/sampling_logp_difference/max": 0.6415880918502808, "sampling/sampling_logp_difference/mean": 0.015238385647535324, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 266.203125, "completions/mean_terminated_length": 266.203125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.24432271718978882, "epoch": 2.6041666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.0124809033525424, "kl": 0.07161411643028259, "learning_rate": 5.3290301970269514e-08, "loss": 0.0092, "num_tokens": 66937155.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007449388504028, "sampling/importance_sampling_ratio/min": 0.47952455282211304, "sampling/sampling_logp_difference/max": 0.7349601984024048, "sampling/sampling_logp_difference/mean": 0.014381598681211472, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 161.59375, "completions/mean_terminated_length": 161.59375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.24809454381465912, "epoch": 2.605392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.3918587831956422, "kl": 0.0901394635438919, "learning_rate": 5.29707379346882e-08, "loss": 0.0029, "num_tokens": 66962969.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4687564373016357, "sampling/importance_sampling_ratio/mean": 0.9996079206466675, "sampling/importance_sampling_ratio/min": 0.37306836247444153, "sampling/sampling_logp_difference/max": 0.9859936237335205, "sampling/sampling_logp_difference/mean": 0.015450348146259785, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 166.859375, "completions/mean_terminated_length": 166.859375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.23761391639709473, "epoch": 2.6066176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.05433803802921959, "kl": 0.07408708333969116, "learning_rate": 5.2652081340188506e-08, "loss": 0.0007, "num_tokens": 66992624.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6512004137039185, "sampling/importance_sampling_ratio/mean": 1.0000839233398438, "sampling/importance_sampling_ratio/min": 0.3969041109085083, "sampling/sampling_logp_difference/max": 0.924060583114624, "sampling/sampling_logp_difference/mean": 0.014247927814722061, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 206.03125, "completions/mean_terminated_length": 206.03125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22145050764083862, "epoch": 2.607843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.168122173907982, "kl": 0.08317376673221588, "learning_rate": 5.2334332833623487e-08, "loss": -0.0355, "num_tokens": 67026194.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001128911972046, "sampling/importance_sampling_ratio/min": 0.5911355018615723, "sampling/sampling_logp_difference/max": 0.7048373222351074, "sampling/sampling_logp_difference/mean": 0.013085026293992996, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.32461151480674744, "epoch": 2.6090686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.1978703401036495, "kl": 0.10807499289512634, "learning_rate": 5.2017493060002196e-08, "loss": 0.0122, "num_tokens": 67058466.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5546787977218628, "sampling/importance_sampling_ratio/mean": 0.9997225999832153, "sampling/importance_sampling_ratio/min": 0.5561249256134033, "sampling/sampling_logp_difference/max": 0.5867623090744019, "sampling/sampling_logp_difference/mean": 0.017366349697113037, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.24303880333900452, "epoch": 2.610294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.0506152082471976, "kl": 0.07377795875072479, "learning_rate": 5.1701562662489596e-08, "loss": 0.0317, "num_tokens": 67093034.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004481077194214, "sampling/importance_sampling_ratio/min": 0.14816617965698242, "sampling/sampling_logp_difference/max": 1.9094208478927612, "sampling/sampling_logp_difference/mean": 0.015139998868107796, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 238.328125, "completions/mean_terminated_length": 238.328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.27354949712753296, "epoch": 2.611519607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.2904571953305226, "kl": 0.09254096448421478, "learning_rate": 5.138654228240424e-08, "loss": 0.0042, "num_tokens": 67126751.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5467699766159058, "sampling/importance_sampling_ratio/mean": 0.9996681809425354, "sampling/importance_sampling_ratio/min": 0.5174968838691711, "sampling/sampling_logp_difference/max": 0.6587517261505127, "sampling/sampling_logp_difference/mean": 0.01584434136748314, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 164.65625, "completions/mean_terminated_length": 164.65625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.24980810284614563, "epoch": 2.6127450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.9901589062946016, "kl": 0.08331011980772018, "learning_rate": 5.1072432559217446e-08, "loss": 0.0142, "num_tokens": 67156057.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8933290243148804, "sampling/importance_sampling_ratio/mean": 0.9995639324188232, "sampling/importance_sampling_ratio/min": 0.5046089291572571, "sampling/sampling_logp_difference/max": 0.683971643447876, "sampling/sampling_logp_difference/mean": 0.015042290091514587, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.24290236830711365, "epoch": 2.6139705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 1.8874488386936594, "kl": 0.12486511468887329, "learning_rate": 5.075923413055222e-08, "loss": 0.03, "num_tokens": 67184825.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.856454849243164, "sampling/importance_sampling_ratio/mean": 1.0002044439315796, "sampling/importance_sampling_ratio/min": 0.6065518856048584, "sampling/sampling_logp_difference/max": 0.6186686754226685, "sampling/sampling_logp_difference/mean": 0.014220327138900757, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 174.953125, "completions/mean_terminated_length": 174.953125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.23684829473495483, "epoch": 2.6151960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.6676110255035501, "kl": 0.10212721675634384, "learning_rate": 5.044694763218149e-08, "loss": -0.0134, "num_tokens": 67212182.0, "reward": 0.0, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5247656106948853, "sampling/importance_sampling_ratio/mean": 1.0001076459884644, "sampling/importance_sampling_ratio/min": 0.6131953001022339, "sampling/sampling_logp_difference/max": 0.4890718460083008, "sampling/sampling_logp_difference/mean": 0.01379355788230896, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 218.296875, "completions/mean_terminated_length": 218.296875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.20745372772216797, "epoch": 2.616421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.7549346864798054, "kl": 0.060086145997047424, "learning_rate": 5.013557369802701e-08, "loss": 0.0199, "num_tokens": 67243673.0, "reward": 0.6875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005900859832764, "sampling/importance_sampling_ratio/min": 0.26832762360572815, "sampling/sampling_logp_difference/max": 1.3155465126037598, "sampling/sampling_logp_difference/mean": 0.012244252488017082, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 176.140625, "completions/mean_terminated_length": 176.140625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.22691315412521362, "epoch": 2.6176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.05290554143748817, "kl": 0.06415960937738419, "learning_rate": 4.982511296015807e-08, "loss": 0.0006, "num_tokens": 67270418.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9223034381866455, "sampling/importance_sampling_ratio/mean": 0.999817430973053, "sampling/importance_sampling_ratio/min": 0.6110133528709412, "sampling/sampling_logp_difference/max": 0.6535241603851318, "sampling/sampling_logp_difference/mean": 0.014248640276491642, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 187.703125, "completions/mean_terminated_length": 187.703125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.276864230632782, "epoch": 2.618872549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.3544910011995401, "kl": 0.08870755136013031, "learning_rate": 4.951556604879048e-08, "loss": 0.0364, "num_tokens": 67298975.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005528926849365, "sampling/importance_sampling_ratio/min": 0.4715724587440491, "sampling/sampling_logp_difference/max": 0.7657084465026855, "sampling/sampling_logp_difference/mean": 0.016868922859430313, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 182.9375, "completions/mean_terminated_length": 182.9375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2801469564437866, "epoch": 2.6200980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 2.0659936887142005, "kl": 0.1408994495868683, "learning_rate": 4.9206933592284725e-08, "loss": -0.0189, "num_tokens": 67335675.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9737051725387573, "sampling/importance_sampling_ratio/mean": 1.0005115270614624, "sampling/importance_sampling_ratio/min": 0.2446073442697525, "sampling/sampling_logp_difference/max": 1.4081010818481445, "sampling/sampling_logp_difference/mean": 0.01681932434439659, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 212.640625, "completions/mean_terminated_length": 212.640625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.22469255328178406, "epoch": 2.6213235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.378163899528437, "kl": 0.04773491621017456, "learning_rate": 4.889921621714516e-08, "loss": 0.0746, "num_tokens": 67374036.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5277727842330933, "sampling/importance_sampling_ratio/mean": 0.999775230884552, "sampling/importance_sampling_ratio/min": 0.30699262022972107, "sampling/sampling_logp_difference/max": 1.180931568145752, "sampling/sampling_logp_difference/mean": 0.012799415737390518, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 197.40625, "completions/mean_terminated_length": 197.40625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.28447532653808594, "epoch": 2.622549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.1631593950910148, "kl": 0.106813944876194, "learning_rate": 4.859241454801866e-08, "loss": -0.01, "num_tokens": 67407550.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.7253344058990479, "sampling/importance_sampling_ratio/mean": 0.9998149871826172, "sampling/importance_sampling_ratio/min": 0.4956277906894684, "sampling/sampling_logp_difference/max": 0.701930046081543, "sampling/sampling_logp_difference/mean": 0.015472184866666794, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 206.359375, "completions/mean_terminated_length": 206.359375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.1956772357225418, "epoch": 2.623774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 1.050202017905082, "kl": 0.07657797634601593, "learning_rate": 4.828652920769311e-08, "loss": -0.0344, "num_tokens": 67439045.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004371404647827, "sampling/importance_sampling_ratio/min": 0.41190850734710693, "sampling/sampling_logp_difference/max": 0.8869540691375732, "sampling/sampling_logp_difference/mean": 0.012367982417345047, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 160.453125, "completions/mean_terminated_length": 160.453125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1892467588186264, "epoch": 2.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.06270786251956269, "kl": 0.06768421828746796, "learning_rate": 4.7981560817096366e-08, "loss": 0.0007, "num_tokens": 67466226.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6377545595169067, "sampling/importance_sampling_ratio/mean": 0.9995848536491394, "sampling/importance_sampling_ratio/min": 0.6116840243339539, "sampling/sampling_logp_difference/max": 0.49332618713378906, "sampling/sampling_logp_difference/mean": 0.01251022145152092, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 145.671875, "completions/mean_terminated_length": 145.671875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.21630370616912842, "epoch": 2.626225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.8027880141909465, "kl": 0.09206929802894592, "learning_rate": 4.767750999529485e-08, "loss": -0.0267, "num_tokens": 67489805.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.8722227811813354, "sampling/importance_sampling_ratio/mean": 0.999727189540863, "sampling/importance_sampling_ratio/min": 0.6369261741638184, "sampling/sampling_logp_difference/max": 0.6271264553070068, "sampling/sampling_logp_difference/mean": 0.01369383092969656, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.18437042832374573, "epoch": 2.627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.05353737510970359, "kl": 0.06574147939682007, "learning_rate": 4.7374377359492624e-08, "loss": 0.0006, "num_tokens": 67520741.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999595582485199, "sampling/importance_sampling_ratio/min": 0.40562233328819275, "sampling/sampling_logp_difference/max": 0.9023327827453613, "sampling/sampling_logp_difference/mean": 0.012694522738456726, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 197.84375, "completions/mean_terminated_length": 197.84375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2577444911003113, "epoch": 2.6286764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.4293126999214278, "kl": 0.08104147017002106, "learning_rate": 4.707216352502974e-08, "loss": 0.0182, "num_tokens": 67550139.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6033425331115723, "sampling/importance_sampling_ratio/mean": 0.9998073577880859, "sampling/importance_sampling_ratio/min": 0.5484043955802917, "sampling/sampling_logp_difference/max": 0.6007423400878906, "sampling/sampling_logp_difference/mean": 0.015712305903434753, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 177.921875, "completions/mean_terminated_length": 177.921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.360873818397522, "epoch": 2.6299019607843137, "frac_reward_zero_std": 0.0, "grad_norm": 2.671935418846424, "kl": 0.14488419890403748, "learning_rate": 4.6770869105380914e-08, "loss": 0.0015, "num_tokens": 67583798.0, "reward": 0.40625, "reward_std": 0.8327301740646362, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8791263103485107, "sampling/importance_sampling_ratio/mean": 0.9996519684791565, "sampling/importance_sampling_ratio/min": 0.5849604606628418, "sampling/sampling_logp_difference/max": 0.6308069229125977, "sampling/sampling_logp_difference/mean": 0.018338389694690704, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 179.234375, "completions/mean_terminated_length": 179.234375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.17377278208732605, "epoch": 2.631127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 1.2666293172358711, "kl": 0.060918159782886505, "learning_rate": 4.647049471215497e-08, "loss": 0.0163, "num_tokens": 67613397.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4857426881790161, "sampling/importance_sampling_ratio/mean": 0.9993095993995667, "sampling/importance_sampling_ratio/min": 0.6075646281242371, "sampling/sampling_logp_difference/max": 0.49829673767089844, "sampling/sampling_logp_difference/mean": 0.011565905064344406, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 165.109375, "completions/mean_terminated_length": 165.109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2292921543121338, "epoch": 2.6323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.04980462181107673, "kl": 0.09878659248352051, "learning_rate": 4.6171040955092835e-08, "loss": 0.0009, "num_tokens": 67640044.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6528207063674927, "sampling/importance_sampling_ratio/mean": 0.9994211196899414, "sampling/importance_sampling_ratio/min": 0.6100565791130066, "sampling/sampling_logp_difference/max": 0.5024833679199219, "sampling/sampling_logp_difference/mean": 0.013480335474014282, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 186.140625, "completions/mean_terminated_length": 186.140625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.21973447501659393, "epoch": 2.633578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.04874730038388227, "kl": 0.05952436476945877, "learning_rate": 4.587250844206664e-08, "loss": 0.0006, "num_tokens": 67669461.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.80014169216156, "sampling/importance_sampling_ratio/mean": 1.0001519918441772, "sampling/importance_sampling_ratio/min": 0.5886259078979492, "sampling/sampling_logp_difference/max": 0.5878653526306152, "sampling/sampling_logp_difference/mean": 0.013445817865431309, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 178.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.23014162480831146, "epoch": 2.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.053334673910715065, "kl": 0.08652716875076294, "learning_rate": 4.557489777907836e-08, "loss": 0.0008, "num_tokens": 67696789.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971620082855225, "sampling/importance_sampling_ratio/mean": 0.999606728553772, "sampling/importance_sampling_ratio/min": 0.6171416640281677, "sampling/sampling_logp_difference/max": 0.48265671730041504, "sampling/sampling_logp_difference/mean": 0.012588806450366974, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 270.203125, "completions/mean_terminated_length": 270.203125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1745699644088745, "epoch": 2.6360294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.05844377521284558, "kl": 0.07397204637527466, "learning_rate": 4.527820957025891e-08, "loss": 0.0006, "num_tokens": 67733314.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998288750648499, "sampling/importance_sampling_ratio/min": 0.5096697211265564, "sampling/sampling_logp_difference/max": 0.8388676643371582, "sampling/sampling_logp_difference/mean": 0.010835763067007065, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 163.21875, "completions/mean_terminated_length": 163.21875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2286285012960434, "epoch": 2.6372549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.3840725376123506, "kl": 0.10957164317369461, "learning_rate": 4.498244441786675e-08, "loss": 0.004, "num_tokens": 67759872.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6009145975112915, "sampling/importance_sampling_ratio/mean": 0.9996766448020935, "sampling/importance_sampling_ratio/min": 0.6485008001327515, "sampling/sampling_logp_difference/max": 0.47057509422302246, "sampling/sampling_logp_difference/mean": 0.013533113524317741, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.25470179319381714, "epoch": 2.638480392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.4633047084419286, "kl": 0.11641483008861542, "learning_rate": 4.4687602922286016e-08, "loss": 0.0178, "num_tokens": 67792104.0, "reward": -0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.9625647068023682, "sampling/importance_sampling_ratio/mean": 1.0000993013381958, "sampling/importance_sampling_ratio/min": 0.3522830605506897, "sampling/sampling_logp_difference/max": 1.0433202981948853, "sampling/sampling_logp_difference/mean": 0.015565533190965652, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 206.921875, "completions/mean_terminated_length": 206.921875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.28980010747909546, "epoch": 2.639705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.6208375507681307, "kl": 0.11233049631118774, "learning_rate": 4.4393685682026505e-08, "loss": 0.003, "num_tokens": 67827971.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5966885089874268, "sampling/importance_sampling_ratio/mean": 1.0005667209625244, "sampling/importance_sampling_ratio/min": 0.5170525312423706, "sampling/sampling_logp_difference/max": 0.6596107482910156, "sampling/sampling_logp_difference/mean": 0.015001502819359303, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 182.0625, "completions/mean_terminated_length": 182.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.29670900106430054, "epoch": 2.6409313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.050815895260017105, "kl": 0.109046071767807, "learning_rate": 4.4100693293721516e-08, "loss": 0.0011, "num_tokens": 67855511.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7177762985229492, "sampling/importance_sampling_ratio/mean": 0.9998449087142944, "sampling/importance_sampling_ratio/min": 0.41617515683174133, "sampling/sampling_logp_difference/max": 0.876649022102356, "sampling/sampling_logp_difference/mean": 0.016118215397000313, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 198.421875, "completions/mean_terminated_length": 198.421875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.31891539692878723, "epoch": 2.642156862745098, "frac_reward_zero_std": 0.25, "grad_norm": 2.066544584098449, "kl": 0.14812368154525757, "learning_rate": 4.3808626352127066e-08, "loss": 0.0771, "num_tokens": 67887218.0, "reward": 0.25, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 1.0007492303848267, "sampling/importance_sampling_ratio/min": 0.2254074215888977, "sampling/sampling_logp_difference/max": 1.4898457527160645, "sampling/sampling_logp_difference/mean": 0.017643041908740997, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 189.484375, "completions/mean_terminated_length": 189.484375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.24129286408424377, "epoch": 2.6433823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 1.2105055067435708, "kl": 0.08592385798692703, "learning_rate": 4.351748545012057e-08, "loss": 0.0148, "num_tokens": 67916353.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996558427810669, "sampling/importance_sampling_ratio/min": 0.4141593277454376, "sampling/sampling_logp_difference/max": 0.8815045356750488, "sampling/sampling_logp_difference/mean": 0.01417774148285389, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 160.453125, "completions/mean_terminated_length": 160.453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.18149279057979584, "epoch": 2.644607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.039533652847857564, "kl": 0.08074481785297394, "learning_rate": 4.322727117869951e-08, "loss": 0.0007, "num_tokens": 67946542.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 1.0003111362457275, "sampling/importance_sampling_ratio/min": 0.4830726981163025, "sampling/sampling_logp_difference/max": 0.7275881767272949, "sampling/sampling_logp_difference/mean": 0.013689331710338593, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 171.734375, "completions/mean_terminated_length": 171.734375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22633984684944153, "epoch": 2.6458333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.12284008384429417, "kl": 0.12330029159784317, "learning_rate": 4.2937984126980686e-08, "loss": 0.0011, "num_tokens": 67972701.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4916808605194092, "sampling/importance_sampling_ratio/mean": 1.0010933876037598, "sampling/importance_sampling_ratio/min": 0.4786601662635803, "sampling/sampling_logp_difference/max": 0.7367644309997559, "sampling/sampling_logp_difference/mean": 0.012793360278010368, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 195.65625, "completions/mean_terminated_length": 195.65625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3101569414138794, "epoch": 2.6470588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.19164975738539, "kl": 0.11972618848085403, "learning_rate": 4.2649624882198196e-08, "loss": -0.0011, "num_tokens": 68005511.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999502301216125, "sampling/importance_sampling_ratio/min": 0.5634000897407532, "sampling/sampling_logp_difference/max": 0.7547166347503662, "sampling/sampling_logp_difference/mean": 0.017234966158866882, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 150.84375, "completions/mean_terminated_length": 150.84375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.290639191865921, "epoch": 2.6482843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.08913893913290491, "kl": 0.16944536566734314, "learning_rate": 4.2362194029703256e-08, "loss": 0.0018, "num_tokens": 68030029.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.967354655265808, "sampling/importance_sampling_ratio/mean": 1.0003464221954346, "sampling/importance_sampling_ratio/min": 0.42517244815826416, "sampling/sampling_logp_difference/max": 0.8552604913711548, "sampling/sampling_logp_difference/mean": 0.01711418852210045, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 184.828125, "completions/mean_terminated_length": 184.828125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2637135982513428, "epoch": 2.6495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.2258549695200438, "kl": 0.1391540765762329, "learning_rate": 4.207569215296214e-08, "loss": 0.0077, "num_tokens": 68059634.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4652819633483887, "sampling/importance_sampling_ratio/mean": 0.9994097948074341, "sampling/importance_sampling_ratio/min": 0.6319909691810608, "sampling/sampling_logp_difference/max": 0.4588801860809326, "sampling/sampling_logp_difference/mean": 0.014466384425759315, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 212.84375, "completions/mean_terminated_length": 212.84375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1459178626537323, "epoch": 2.650735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02709128132491764, "kl": 0.027253184467554092, "learning_rate": 4.179011983355568e-08, "loss": 0.0003, "num_tokens": 68098072.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7416507005691528, "sampling/importance_sampling_ratio/mean": 1.0003209114074707, "sampling/importance_sampling_ratio/min": 0.3821437656879425, "sampling/sampling_logp_difference/max": 0.9619584083557129, "sampling/sampling_logp_difference/mean": 0.010496910661458969, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 196.390625, "completions/mean_terminated_length": 196.390625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.21812394261360168, "epoch": 2.6519607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.0931667172856299, "kl": 0.07285928726196289, "learning_rate": 4.150547765117746e-08, "loss": -0.0063, "num_tokens": 68126257.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5161488056182861, "sampling/importance_sampling_ratio/mean": 0.9997991919517517, "sampling/importance_sampling_ratio/min": 0.5910096764564514, "sampling/sampling_logp_difference/max": 0.5259228944778442, "sampling/sampling_logp_difference/mean": 0.012680470943450928, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 203.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.21613028645515442, "epoch": 2.653186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.5038236539469443, "kl": 0.07153953611850739, "learning_rate": 4.1221766183633045e-08, "loss": -0.009, "num_tokens": 68162817.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4761751890182495, "sampling/importance_sampling_ratio/mean": 1.0000369548797607, "sampling/importance_sampling_ratio/min": 0.4108840227127075, "sampling/sampling_logp_difference/max": 0.8894443511962891, "sampling/sampling_logp_difference/mean": 0.013531733304262161, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 143.65625, "completions/mean_terminated_length": 143.65625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.16796152293682098, "epoch": 2.6544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.056318582333555633, "kl": 0.05783125013113022, "learning_rate": 4.0938986006838926e-08, "loss": 0.0006, "num_tokens": 68187451.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6359341144561768, "sampling/importance_sampling_ratio/mean": 0.9998639822006226, "sampling/importance_sampling_ratio/min": 0.6097320318222046, "sampling/sampling_logp_difference/max": 0.4947357177734375, "sampling/sampling_logp_difference/mean": 0.010606948286294937, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2929326891899109, "epoch": 2.655637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 2.2168245171692678, "kl": 0.14285142719745636, "learning_rate": 4.065713769482082e-08, "loss": 0.0052, "num_tokens": 68218891.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998737573623657, "sampling/importance_sampling_ratio/min": 0.46241626143455505, "sampling/sampling_logp_difference/max": 0.7988262176513672, "sampling/sampling_logp_difference/mean": 0.016140636056661606, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 155.515625, "completions/mean_terminated_length": 155.515625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2612577974796295, "epoch": 2.656862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.547020456094274, "kl": 0.09141572564840317, "learning_rate": 4.037622181971295e-08, "loss": 0.0108, "num_tokens": 68245932.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008686780929565, "sampling/importance_sampling_ratio/min": 0.5016920566558838, "sampling/sampling_logp_difference/max": 0.9166195392608643, "sampling/sampling_logp_difference/mean": 0.015860222280025482, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 187.09375, "completions/mean_terminated_length": 187.09375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.30533725023269653, "epoch": 2.6580882352941178, "frac_reward_zero_std": 0.25, "grad_norm": 2.1363831956584387, "kl": 0.15566954016685486, "learning_rate": 4.009623895175662e-08, "loss": 0.032, "num_tokens": 68275042.0, "reward": 0.375, "reward_std": 0.6789814233779907, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 1.0000407695770264, "sampling/importance_sampling_ratio/min": 0.4782702624797821, "sampling/sampling_logp_difference/max": 0.737579345703125, "sampling/sampling_logp_difference/mean": 0.015945661813020706, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 191.84375, "completions/mean_terminated_length": 191.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22283464670181274, "epoch": 2.659313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.5947561048545837, "kl": 0.07634736597537994, "learning_rate": 3.981718965929959e-08, "loss": 0.023, "num_tokens": 68309816.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.894351601600647, "sampling/importance_sampling_ratio/mean": 0.9995042681694031, "sampling/importance_sampling_ratio/min": 0.6615972518920898, "sampling/sampling_logp_difference/max": 0.6388766765594482, "sampling/sampling_logp_difference/mean": 0.01337276678532362, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 173.484375, "completions/mean_terminated_length": 173.484375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.18940766155719757, "epoch": 2.6605392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 2.0193577112878476, "kl": 0.0752667635679245, "learning_rate": 3.953907450879407e-08, "loss": 0.0264, "num_tokens": 68335895.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8179810047149658, "sampling/importance_sampling_ratio/mean": 1.000649094581604, "sampling/importance_sampling_ratio/min": 0.41839301586151123, "sampling/sampling_logp_difference/max": 0.8713340759277344, "sampling/sampling_logp_difference/mean": 0.012507520616054535, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 143.71875, "completions/mean_terminated_length": 143.71875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.21278055012226105, "epoch": 2.661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.06850182894093394, "kl": 0.05794823169708252, "learning_rate": 3.926189406479613e-08, "loss": 0.0006, "num_tokens": 68368245.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9695031642913818, "sampling/importance_sampling_ratio/mean": 1.0009080171585083, "sampling/importance_sampling_ratio/min": 0.44676312804222107, "sampling/sampling_logp_difference/max": 0.8057267665863037, "sampling/sampling_logp_difference/mean": 0.015010137110948563, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 182.453125, "completions/mean_terminated_length": 182.453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2997394800186157, "epoch": 2.6629901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.876729619808282, "kl": 0.1063377782702446, "learning_rate": 3.898564888996475e-08, "loss": 0.0048, "num_tokens": 68397090.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.590027093887329, "sampling/importance_sampling_ratio/mean": 1.0003613233566284, "sampling/importance_sampling_ratio/min": 0.5680696368217468, "sampling/sampling_logp_difference/max": 0.5655112266540527, "sampling/sampling_logp_difference/mean": 0.016017910093069077, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 250.25, "completions/mean_terminated_length": 250.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.24907062947750092, "epoch": 2.6642156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 1.284869883192685, "kl": 0.12940922379493713, "learning_rate": 3.871033954505998e-08, "loss": 0.0139, "num_tokens": 68427010.0, "reward": 0.375, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7548948526382446, "sampling/importance_sampling_ratio/mean": 0.9992954730987549, "sampling/importance_sampling_ratio/min": 0.5566850900650024, "sampling/sampling_logp_difference/max": 0.5857555270195007, "sampling/sampling_logp_difference/mean": 0.013966716825962067, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 171.109375, "completions/mean_terminated_length": 171.109375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2446766197681427, "epoch": 2.6654411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 2.1051873344208007, "kl": 0.08149237930774689, "learning_rate": 3.843596658894232e-08, "loss": -0.019, "num_tokens": 68456169.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9990978837013245, "sampling/importance_sampling_ratio/min": 0.5058292150497437, "sampling/sampling_logp_difference/max": 0.729045033454895, "sampling/sampling_logp_difference/mean": 0.014238040894269943, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2134552299976349, "epoch": 2.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.07743073911001033, "kl": 0.1175810694694519, "learning_rate": 3.816253057857144e-08, "loss": 0.0011, "num_tokens": 68482337.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000868558883667, "sampling/importance_sampling_ratio/min": 0.5550065040588379, "sampling/sampling_logp_difference/max": 0.709916353225708, "sampling/sampling_logp_difference/mean": 0.014420795254409313, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 174.546875, "completions/mean_terminated_length": 174.546875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.23113234341144562, "epoch": 2.667892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.056168849697236796, "kl": 0.0727619156241417, "learning_rate": 3.789003206900537e-08, "loss": 0.0007, "num_tokens": 68515156.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5723931789398193, "sampling/importance_sampling_ratio/mean": 1.0006955862045288, "sampling/importance_sampling_ratio/min": 0.3827452063560486, "sampling/sampling_logp_difference/max": 0.960385799407959, "sampling/sampling_logp_difference/mean": 0.015615207143127918, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 202.28125, "completions/mean_terminated_length": 202.28125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2749329209327698, "epoch": 2.6691176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.6762929868067675, "kl": 0.09398925304412842, "learning_rate": 3.7618471613398597e-08, "loss": -0.0335, "num_tokens": 68552822.0, "reward": 0.4375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997743368148804, "sampling/importance_sampling_ratio/min": 0.29177358746528625, "sampling/sampling_logp_difference/max": 1.2317771911621094, "sampling/sampling_logp_difference/mean": 0.01894037052989006, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 202.796875, "completions/mean_terminated_length": 202.796875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3282843828201294, "epoch": 2.670343137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.6705478242006395, "kl": 0.13938282430171967, "learning_rate": 3.734784976300165e-08, "loss": -0.018, "num_tokens": 68587961.0, "reward": 0.1875, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6016663312911987, "sampling/importance_sampling_ratio/mean": 0.999578058719635, "sampling/importance_sampling_ratio/min": 0.4243606925010681, "sampling/sampling_logp_difference/max": 0.8571715354919434, "sampling/sampling_logp_difference/mean": 0.017624136060476303, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 145.421875, "completions/mean_terminated_length": 145.421875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.20929139852523804, "epoch": 2.6715686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.6164602959815932, "kl": 0.10157999396324158, "learning_rate": 3.7078167067159826e-08, "loss": -0.0035, "num_tokens": 68612260.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.7619878053665161, "sampling/importance_sampling_ratio/mean": 1.0003855228424072, "sampling/importance_sampling_ratio/min": 0.6008952856063843, "sampling/sampling_logp_difference/max": 0.566442608833313, "sampling/sampling_logp_difference/mean": 0.012949692085385323, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.25185340642929077, "epoch": 2.672794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.1774559193845284, "kl": 0.08610805869102478, "learning_rate": 3.6809424073311944e-08, "loss": 0.0065, "num_tokens": 68643444.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.855698823928833, "sampling/importance_sampling_ratio/mean": 1.0000008344650269, "sampling/importance_sampling_ratio/min": 0.3847578763961792, "sampling/sampling_logp_difference/max": 0.9551410675048828, "sampling/sampling_logp_difference/mean": 0.018331876024603844, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.22100377082824707, "epoch": 2.674019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.6092179363665533, "kl": 0.09422071278095245, "learning_rate": 3.654162132698918e-08, "loss": 0.0541, "num_tokens": 68670236.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6084016561508179, "sampling/importance_sampling_ratio/mean": 1.0002474784851074, "sampling/importance_sampling_ratio/min": 0.49823063611984253, "sampling/sampling_logp_difference/max": 0.6966922283172607, "sampling/sampling_logp_difference/mean": 0.012405113317072392, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 216.40625, "completions/mean_terminated_length": 216.40625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24076037108898163, "epoch": 2.6752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.0609488661572175, "kl": 0.12102796882390976, "learning_rate": 3.627475937181407e-08, "loss": 0.009, "num_tokens": 68705414.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6280242204666138, "sampling/importance_sampling_ratio/mean": 1.0000336170196533, "sampling/importance_sampling_ratio/min": 0.5584572553634644, "sampling/sampling_logp_difference/max": 0.5825772285461426, "sampling/sampling_logp_difference/mean": 0.01332184486091137, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 172.8125, "completions/mean_terminated_length": 172.8125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.24437132477760315, "epoch": 2.6764705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 2.127945876833359, "kl": 0.09145861119031906, "learning_rate": 3.600883874949967e-08, "loss": 0.0149, "num_tokens": 68732266.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5333822965621948, "sampling/importance_sampling_ratio/mean": 1.0007474422454834, "sampling/importance_sampling_ratio/min": 0.5185667276382446, "sampling/sampling_logp_difference/max": 0.656686544418335, "sampling/sampling_logp_difference/mean": 0.014701835811138153, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 206.671875, "completions/mean_terminated_length": 206.671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2812899649143219, "epoch": 2.6776960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.2944153670239382, "kl": 0.0997249186038971, "learning_rate": 3.574385999984786e-08, "loss": 0.0004, "num_tokens": 68763621.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6091861724853516, "sampling/importance_sampling_ratio/mean": 1.000199556350708, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.015883859246969223, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 187.96875, "completions/mean_terminated_length": 187.96875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.18499572575092316, "epoch": 2.678921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.462843051683198, "kl": 0.06560080498456955, "learning_rate": 3.54798236607487e-08, "loss": -0.0288, "num_tokens": 68790355.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5314031839370728, "sampling/importance_sampling_ratio/mean": 0.9995941519737244, "sampling/importance_sampling_ratio/min": 0.2466481328010559, "sampling/sampling_logp_difference/max": 1.3997925519943237, "sampling/sampling_logp_difference/mean": 0.012036822736263275, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 182.6875, "completions/mean_terminated_length": 182.6875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2512951195240021, "epoch": 2.6801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.4222979639673834, "kl": 0.07893835008144379, "learning_rate": 3.5216730268179337e-08, "loss": -0.0234, "num_tokens": 68824447.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6204001903533936, "sampling/importance_sampling_ratio/mean": 1.0002648830413818, "sampling/importance_sampling_ratio/min": 0.6091734766960144, "sampling/sampling_logp_difference/max": 0.4956521987915039, "sampling/sampling_logp_difference/mean": 0.014549204148352146, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 165.203125, "completions/mean_terminated_length": 165.203125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.22966158390045166, "epoch": 2.681372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.3408040168452255, "kl": 0.07859854400157928, "learning_rate": 3.495458035620252e-08, "loss": 0.0078, "num_tokens": 68851980.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8719398975372314, "sampling/importance_sampling_ratio/mean": 0.9992271661758423, "sampling/importance_sampling_ratio/min": 0.5702707171440125, "sampling/sampling_logp_difference/max": 0.6269752979278564, "sampling/sampling_logp_difference/mean": 0.014652803540229797, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 223.046875, "completions/mean_terminated_length": 223.046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.29275381565093994, "epoch": 2.6825980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.282237347488146, "kl": 0.12887243926525116, "learning_rate": 3.469337445696629e-08, "loss": -0.1028, "num_tokens": 68883711.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.7657581567764282, "sampling/importance_sampling_ratio/mean": 0.9999592304229736, "sampling/importance_sampling_ratio/min": 0.47417914867401123, "sampling/sampling_logp_difference/max": 0.7461700439453125, "sampling/sampling_logp_difference/mean": 0.01636071503162384, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 174.046875, "completions/mean_terminated_length": 174.046875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2956045866012573, "epoch": 2.6838235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.439245239378223, "kl": 0.11607477068901062, "learning_rate": 3.4433113100701683e-08, "loss": -0.0151, "num_tokens": 68911890.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007710456848145, "sampling/importance_sampling_ratio/min": 0.5396926403045654, "sampling/sampling_logp_difference/max": 0.776573657989502, "sampling/sampling_logp_difference/mean": 0.016570597887039185, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.22272804379463196, "epoch": 2.685049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.3186346124254964, "kl": 0.06915993988513947, "learning_rate": 3.417379681572296e-08, "loss": 0.0034, "num_tokens": 68941978.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6963083744049072, "sampling/importance_sampling_ratio/mean": 0.9999330639839172, "sampling/importance_sampling_ratio/min": 0.42088425159454346, "sampling/sampling_logp_difference/max": 0.8653974533081055, "sampling/sampling_logp_difference/mean": 0.014592324383556843, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 168.34375, "completions/mean_terminated_length": 168.34375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2185148000717163, "epoch": 2.686274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 1.230048376251708, "kl": 0.08221367746591568, "learning_rate": 3.391542612842574e-08, "loss": -0.0049, "num_tokens": 68972128.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7236268520355225, "sampling/importance_sampling_ratio/mean": 1.000931978225708, "sampling/importance_sampling_ratio/min": 0.5048971176147461, "sampling/sampling_logp_difference/max": 0.6834006309509277, "sampling/sampling_logp_difference/mean": 0.014054270461201668, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 209.46875, "completions/mean_terminated_length": 209.46875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.22842177748680115, "epoch": 2.6875, "frac_reward_zero_std": 0.5, "grad_norm": 1.6492742690847177, "kl": 0.09010639786720276, "learning_rate": 3.365800156328619e-08, "loss": -0.0249, "num_tokens": 69004974.0, "reward": 0.0625, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7725313901901245, "sampling/importance_sampling_ratio/mean": 0.9995955228805542, "sampling/importance_sampling_ratio/min": 0.6161969900131226, "sampling/sampling_logp_difference/max": 0.5724086761474609, "sampling/sampling_logp_difference/mean": 0.012595223262906075, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 154.90625, "completions/mean_terminated_length": 154.90625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.25736111402511597, "epoch": 2.688725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.05073242573608991, "kl": 0.07285177707672119, "learning_rate": 3.3401523642859805e-08, "loss": 0.0007, "num_tokens": 69036392.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5882704257965088, "sampling/importance_sampling_ratio/mean": 0.9993676543235779, "sampling/importance_sampling_ratio/min": 0.41003796458244324, "sampling/sampling_logp_difference/max": 0.8915054798126221, "sampling/sampling_logp_difference/mean": 0.017075642943382263, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 147.296875, "completions/mean_terminated_length": 147.296875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.21071837842464447, "epoch": 2.689950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.3758237494845689, "kl": 0.06788364052772522, "learning_rate": 3.3145992887780475e-08, "loss": -0.0024, "num_tokens": 69062539.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5085123777389526, "sampling/importance_sampling_ratio/mean": 0.9999905228614807, "sampling/importance_sampling_ratio/min": 0.49252715706825256, "sampling/sampling_logp_difference/max": 0.7082056999206543, "sampling/sampling_logp_difference/mean": 0.01327443402260542, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 205.078125, "completions/mean_terminated_length": 205.078125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.23580729961395264, "epoch": 2.6911764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.203832190676799, "kl": 0.07924351096153259, "learning_rate": 3.289140981675964e-08, "loss": -0.0017, "num_tokens": 69093408.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6272674798965454, "sampling/importance_sampling_ratio/mean": 1.0008111000061035, "sampling/importance_sampling_ratio/min": 0.6301074028015137, "sampling/sampling_logp_difference/max": 0.48690223693847656, "sampling/sampling_logp_difference/mean": 0.013151616789400578, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 245.359375, "completions/mean_terminated_length": 245.359375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.2598392367362976, "epoch": 2.6924019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.045813206867263846, "kl": 0.07030776143074036, "learning_rate": 3.263777494658448e-08, "loss": 0.0007, "num_tokens": 69130663.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5993943214416504, "sampling/importance_sampling_ratio/mean": 1.0002115964889526, "sampling/importance_sampling_ratio/min": 0.535071074962616, "sampling/sampling_logp_difference/max": 0.6253557205200195, "sampling/sampling_logp_difference/mean": 0.01403868943452835, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.20189495384693146, "epoch": 2.693627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 1.1070948841503783, "kl": 0.1220339834690094, "learning_rate": 3.2385088792118044e-08, "loss": 0.0019, "num_tokens": 69159847.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6558984518051147, "sampling/importance_sampling_ratio/mean": 1.0001466274261475, "sampling/importance_sampling_ratio/min": 0.6061429977416992, "sampling/sampling_logp_difference/max": 0.5043437480926514, "sampling/sampling_logp_difference/mean": 0.01365315355360508, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.28888899087905884, "epoch": 2.6948529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.07784601669417482, "kl": 0.1149665042757988, "learning_rate": 3.2133351866296955e-08, "loss": 0.0013, "num_tokens": 69191559.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006163120269775, "sampling/importance_sampling_ratio/min": 0.4392172694206238, "sampling/sampling_logp_difference/max": 0.822761058807373, "sampling/sampling_logp_difference/mean": 0.016775013878941536, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 204.65625, "completions/mean_terminated_length": 204.65625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.22148284316062927, "epoch": 2.696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.9038119411824908, "kl": 0.05128861218690872, "learning_rate": 3.188256468013139e-08, "loss": 0.0162, "num_tokens": 69223041.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5878969430923462, "sampling/importance_sampling_ratio/mean": 0.9997825622558594, "sampling/importance_sampling_ratio/min": 0.553889274597168, "sampling/sampling_logp_difference/max": 0.5907905101776123, "sampling/sampling_logp_difference/mean": 0.01325621921569109, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 159.0625, "completions/mean_terminated_length": 159.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.28267282247543335, "epoch": 2.6973039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.8340136244817302, "kl": 0.17508764564990997, "learning_rate": 3.163272774270348e-08, "loss": -0.0049, "num_tokens": 69247621.0, "reward": 0.3125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6154650449752808, "sampling/importance_sampling_ratio/mean": 0.999939501285553, "sampling/importance_sampling_ratio/min": 0.5038948059082031, "sampling/sampling_logp_difference/max": 0.6853878498077393, "sampling/sampling_logp_difference/mean": 0.015061620622873306, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 145.640625, "completions/mean_terminated_length": 145.640625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2137831449508667, "epoch": 2.6985294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 1.6664174079426848, "kl": 0.10704906284809113, "learning_rate": 3.1383841561166134e-08, "loss": 0.0061, "num_tokens": 69269486.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.456510066986084, "sampling/importance_sampling_ratio/mean": 0.999748945236206, "sampling/importance_sampling_ratio/min": 0.4057944118976593, "sampling/sampling_logp_difference/max": 0.9019086360931396, "sampling/sampling_logp_difference/mean": 0.013272078707814217, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 208.234375, "completions/mean_terminated_length": 208.234375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2738074064254761, "epoch": 2.6997549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.7830345545165356, "kl": 0.12862104177474976, "learning_rate": 3.1135906640742836e-08, "loss": 0.0934, "num_tokens": 69301181.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6089228391647339, "sampling/importance_sampling_ratio/mean": 1.0008376836776733, "sampling/importance_sampling_ratio/min": 0.35846590995788574, "sampling/sampling_logp_difference/max": 1.0259218215942383, "sampling/sampling_logp_difference/mean": 0.01580899953842163, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 189.5625, "completions/mean_terminated_length": 189.5625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.24207505583763123, "epoch": 2.700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.707672381286519, "kl": 0.07233038544654846, "learning_rate": 3.088892348472561e-08, "loss": 0.0326, "num_tokens": 69331793.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5470654964447021, "sampling/importance_sampling_ratio/mean": 1.0001225471496582, "sampling/importance_sampling_ratio/min": 0.3600119948387146, "sampling/sampling_logp_difference/max": 1.0216180086135864, "sampling/sampling_logp_difference/mean": 0.014072047546505928, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 183.578125, "completions/mean_terminated_length": 183.578125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2872757911682129, "epoch": 2.702205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 1.2622679481807824, "kl": 0.13634923100471497, "learning_rate": 3.064289259447455e-08, "loss": 0.0036, "num_tokens": 69357414.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5744361877441406, "sampling/importance_sampling_ratio/mean": 1.000053882598877, "sampling/importance_sampling_ratio/min": 0.5513951778411865, "sampling/sampling_logp_difference/max": 0.5953035354614258, "sampling/sampling_logp_difference/mean": 0.01701120287179947, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3441547751426697, "epoch": 2.7034313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 2.069953362663289, "kl": 0.13246379792690277, "learning_rate": 3.039781446941697e-08, "loss": -0.0108, "num_tokens": 69390014.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.86025071144104, "sampling/importance_sampling_ratio/mean": 1.0000040531158447, "sampling/importance_sampling_ratio/min": 0.6281057000160217, "sampling/sampling_logp_difference/max": 0.6207113265991211, "sampling/sampling_logp_difference/mean": 0.016756637021899223, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 141.234375, "completions/mean_terminated_length": 141.234375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2297401875257492, "epoch": 2.704656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.13295957964192232, "kl": 0.08374090492725372, "learning_rate": 3.015368960704584e-08, "loss": 0.0009, "num_tokens": 69416845.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001006126403809, "sampling/importance_sampling_ratio/min": 0.4029476046562195, "sampling/sampling_logp_difference/max": 0.9089487791061401, "sampling/sampling_logp_difference/mean": 0.016758276149630547, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 200.59375, "completions/mean_terminated_length": 200.59375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.27449631690979004, "epoch": 2.7058823529411766, "frac_reward_zero_std": 0.25, "grad_norm": 2.303810297427504, "kl": 0.11424297839403152, "learning_rate": 2.991051850291915e-08, "loss": 0.0132, "num_tokens": 69444211.0, "reward": 0.40625, "reward_std": 0.5431214570999146, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8529949188232422, "sampling/importance_sampling_ratio/mean": 1.00038480758667, "sampling/importance_sampling_ratio/min": 0.3628750145435333, "sampling/sampling_logp_difference/max": 1.0136967897415161, "sampling/sampling_logp_difference/mean": 0.015266727656126022, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 181.15625, "completions/mean_terminated_length": 181.15625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.23630809783935547, "epoch": 2.707107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.2042021830786684, "kl": 0.10635527968406677, "learning_rate": 2.9668301650658756e-08, "loss": -0.0039, "num_tokens": 69476605.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997677803039551, "sampling/importance_sampling_ratio/min": 0.4353478252887726, "sampling/sampling_logp_difference/max": 0.8316099643707275, "sampling/sampling_logp_difference/mean": 0.014236892573535442, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 221.9375, "completions/mean_terminated_length": 221.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20373618602752686, "epoch": 2.7083333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.8347800298068822, "kl": 0.05157628282904625, "learning_rate": 2.9427039541949638e-08, "loss": -0.0672, "num_tokens": 69507161.0, "reward": 0.125, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 1.0000061988830566, "sampling/importance_sampling_ratio/min": 0.6146666407585144, "sampling/sampling_logp_difference/max": 0.48689985275268555, "sampling/sampling_logp_difference/mean": 0.010913224890828133, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 159.203125, "completions/mean_terminated_length": 159.203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2779569625854492, "epoch": 2.7095588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 2.08142156977234, "kl": 0.08717378973960876, "learning_rate": 2.918673266653865e-08, "loss": -0.0266, "num_tokens": 69535110.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6978199481964111, "sampling/importance_sampling_ratio/mean": 0.9993019104003906, "sampling/importance_sampling_ratio/min": 0.49558350443840027, "sampling/sampling_logp_difference/max": 0.702019453048706, "sampling/sampling_logp_difference/mean": 0.016375428065657616, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 151.96875, "completions/mean_terminated_length": 151.96875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.20503515005111694, "epoch": 2.7107843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.4751383794409036, "kl": 0.07435000687837601, "learning_rate": 2.8947381512233305e-08, "loss": 0.0156, "num_tokens": 69562356.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9469518661499023, "sampling/importance_sampling_ratio/mean": 1.0001492500305176, "sampling/importance_sampling_ratio/min": 0.48148271441459656, "sampling/sampling_logp_difference/max": 0.7308850288391113, "sampling/sampling_logp_difference/mean": 0.014628005214035511, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 211.53125, "completions/mean_terminated_length": 211.53125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.23537412285804749, "epoch": 2.7120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.3870547483022524, "kl": 0.07981985807418823, "learning_rate": 2.8708986564901504e-08, "loss": -0.0097, "num_tokens": 69594150.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6990214586257935, "sampling/importance_sampling_ratio/mean": 1.000446081161499, "sampling/importance_sampling_ratio/min": 0.4834754765033722, "sampling/sampling_logp_difference/max": 0.7267546653747559, "sampling/sampling_logp_difference/mean": 0.015160782262682915, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 181.890625, "completions/mean_terminated_length": 181.890625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.17048130929470062, "epoch": 2.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04539440661198622, "kl": 0.04399728402495384, "learning_rate": 2.8471548308469706e-08, "loss": 0.0004, "num_tokens": 69617903.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6598318815231323, "sampling/importance_sampling_ratio/mean": 1.0003302097320557, "sampling/importance_sampling_ratio/min": 0.536295473575592, "sampling/sampling_logp_difference/max": 0.6230700016021729, "sampling/sampling_logp_difference/mean": 0.012363039888441563, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 180.59375, "completions/mean_terminated_length": 180.59375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26669445633888245, "epoch": 2.7144607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.5988910701713328, "kl": 0.13101765513420105, "learning_rate": 2.8235067224922802e-08, "loss": 0.0155, "num_tokens": 69644197.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.619104266166687, "sampling/importance_sampling_ratio/mean": 0.9997296929359436, "sampling/importance_sampling_ratio/min": 0.5253884196281433, "sampling/sampling_logp_difference/max": 0.6436173915863037, "sampling/sampling_logp_difference/mean": 0.015141832642257214, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 231.1875, "completions/mean_terminated_length": 231.1875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.23241564631462097, "epoch": 2.715686274509804, "frac_reward_zero_std": 0.25, "grad_norm": 1.8356100270459297, "kl": 0.08637715876102448, "learning_rate": 2.799954379430208e-08, "loss": -0.0297, "num_tokens": 69680689.0, "reward": 0.71875, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.697717308998108, "sampling/importance_sampling_ratio/mean": 1.000471591949463, "sampling/importance_sampling_ratio/min": 0.4300449788570404, "sampling/sampling_logp_difference/max": 0.8438655138015747, "sampling/sampling_logp_difference/mean": 0.014142333529889584, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 185.5625, "completions/mean_terminated_length": 185.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2993847131729126, "epoch": 2.7169117647058822, "frac_reward_zero_std": 0.25, "grad_norm": 2.152223952848721, "kl": 0.15374311804771423, "learning_rate": 2.7764978494705437e-08, "loss": -0.0387, "num_tokens": 69712949.0, "reward": 0.28125, "reward_std": 0.7643726468086243, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6132571697235107, "sampling/importance_sampling_ratio/mean": 0.9998944997787476, "sampling/importance_sampling_ratio/min": 0.45971566438674927, "sampling/sampling_logp_difference/max": 0.7771470546722412, "sampling/sampling_logp_difference/mean": 0.015908479690551758, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 177.203125, "completions/mean_terminated_length": 177.203125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.16632476449012756, "epoch": 2.718137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.3446729340689112, "kl": 0.05673764646053314, "learning_rate": 2.753137180228543e-08, "loss": -0.0011, "num_tokens": 69737010.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8623982667922974, "sampling/importance_sampling_ratio/mean": 0.9996699094772339, "sampling/importance_sampling_ratio/min": 0.47910451889038086, "sampling/sampling_logp_difference/max": 0.7358365058898926, "sampling/sampling_logp_difference/mean": 0.01236045453697443, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 225.03125, "completions/mean_terminated_length": 225.03125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3208922743797302, "epoch": 2.719362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.8880314209417628, "kl": 0.09001030772924423, "learning_rate": 2.729872419124879e-08, "loss": -0.0076, "num_tokens": 69768868.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.513028860092163, "sampling/importance_sampling_ratio/mean": 1.0003256797790527, "sampling/importance_sampling_ratio/min": 0.47280260920524597, "sampling/sampling_logp_difference/max": 0.749077320098877, "sampling/sampling_logp_difference/mean": 0.016022734344005585, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 156.109375, "completions/mean_terminated_length": 156.109375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.21761839091777802, "epoch": 2.7205882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.760803295294139, "kl": 0.10163723677396774, "learning_rate": 2.7067036133855636e-08, "loss": -0.0109, "num_tokens": 69797995.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6356077194213867, "sampling/importance_sampling_ratio/mean": 1.0007399320602417, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.5662540197372437, "sampling/sampling_logp_difference/mean": 0.014694325625896454, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 142.234375, "completions/mean_terminated_length": 142.234375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2053075134754181, "epoch": 2.721813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0467044526354985, "kl": 0.07528539001941681, "learning_rate": 2.6836308100417872e-08, "loss": 0.0008, "num_tokens": 69825258.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996945261955261, "sampling/importance_sampling_ratio/min": 0.613776445388794, "sampling/sampling_logp_difference/max": 0.955998420715332, "sampling/sampling_logp_difference/mean": 0.012028173543512821, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 162.890625, "completions/mean_terminated_length": 162.890625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2621799111366272, "epoch": 2.7230392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.2452878397876777, "kl": 0.11138811707496643, "learning_rate": 2.6606540559298952e-08, "loss": 0.0125, "num_tokens": 69854243.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.627645492553711, "sampling/importance_sampling_ratio/mean": 1.0003089904785156, "sampling/importance_sampling_ratio/min": 0.621735692024231, "sampling/sampling_logp_difference/max": 0.4871344566345215, "sampling/sampling_logp_difference/mean": 0.013921591453254223, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 204.71875, "completions/mean_terminated_length": 204.71875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.18779292702674866, "epoch": 2.724264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.039374648181891785, "kl": 0.04470675811171532, "learning_rate": 2.6377733976912232e-08, "loss": 0.0004, "num_tokens": 69881889.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7185267210006714, "sampling/importance_sampling_ratio/mean": 1.000443696975708, "sampling/importance_sampling_ratio/min": 0.48124879598617554, "sampling/sampling_logp_difference/max": 0.7313709259033203, "sampling/sampling_logp_difference/mean": 0.013233638368546963, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 191.03125, "completions/mean_terminated_length": 191.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24338199198246002, "epoch": 2.7254901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.07125307899557784, "kl": 0.08830739557743073, "learning_rate": 2.6149888817720733e-08, "loss": 0.001, "num_tokens": 69913427.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8971585035324097, "sampling/importance_sampling_ratio/mean": 1.0001130104064941, "sampling/importance_sampling_ratio/min": 0.4601728022098541, "sampling/sampling_logp_difference/max": 0.7761532068252563, "sampling/sampling_logp_difference/mean": 0.014650973491370678, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 196.484375, "completions/mean_terminated_length": 196.484375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23095597326755524, "epoch": 2.7267156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.04004652586011617, "kl": 0.04922451078891754, "learning_rate": 2.5923005544235545e-08, "loss": 0.0005, "num_tokens": 69949442.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6214406490325928, "sampling/importance_sampling_ratio/mean": 1.0002517700195312, "sampling/importance_sampling_ratio/min": 0.4137243926525116, "sampling/sampling_logp_difference/max": 0.8825552463531494, "sampling/sampling_logp_difference/mean": 0.013647917658090591, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.27529871463775635, "epoch": 2.7279411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.19390980695616739, "kl": 0.07142721861600876, "learning_rate": 2.5697084617015475e-08, "loss": 0.0007, "num_tokens": 69987986.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.674102783203125, "sampling/importance_sampling_ratio/mean": 0.9996586441993713, "sampling/importance_sampling_ratio/min": 0.3969041407108307, "sampling/sampling_logp_difference/max": 0.9240604639053345, "sampling/sampling_logp_difference/mean": 0.015878435224294662, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 224.015625, "completions/mean_terminated_length": 224.015625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.21713590621948242, "epoch": 2.7291666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.06094074724350776, "kl": 0.05968696251511574, "learning_rate": 2.547212649466568e-08, "loss": 0.0006, "num_tokens": 70024403.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998601675033569, "sampling/importance_sampling_ratio/min": 0.08962565660476685, "sampling/sampling_logp_difference/max": 2.412113666534424, "sampling/sampling_logp_difference/mean": 0.014678400941193104, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 155.484375, "completions/mean_terminated_length": 155.484375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.27618515491485596, "epoch": 2.730392156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.98056953965246, "kl": 0.15442435443401337, "learning_rate": 2.5248131633836823e-08, "loss": 0.0119, "num_tokens": 70057922.0, "reward": -0.375, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6089198589324951, "sampling/importance_sampling_ratio/mean": 0.9995495080947876, "sampling/importance_sampling_ratio/min": 0.5513139963150024, "sampling/sampling_logp_difference/max": 0.595450758934021, "sampling/sampling_logp_difference/mean": 0.017073802649974823, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 195.03125, "completions/mean_terminated_length": 195.03125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.30946943163871765, "epoch": 2.7316176470588234, "frac_reward_zero_std": 0.25, "grad_norm": 1.9276934903335479, "kl": 0.12449204176664352, "learning_rate": 2.5025100489224406e-08, "loss": 0.002, "num_tokens": 70089252.0, "reward": 0.875, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002161264419556, "sampling/importance_sampling_ratio/min": 0.5910323262214661, "sampling/sampling_logp_difference/max": 0.7526144981384277, "sampling/sampling_logp_difference/mean": 0.01713874749839306, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 236.46875, "completions/mean_terminated_length": 236.46875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.29705095291137695, "epoch": 2.732843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 7.14257301574741, "kl": 0.17638657987117767, "learning_rate": 2.480303351356733e-08, "loss": -0.001, "num_tokens": 70125650.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995177984237671, "sampling/importance_sampling_ratio/min": 0.006017514504492283, "sampling/sampling_logp_difference/max": 5.113080978393555, "sampling/sampling_logp_difference/mean": 0.016602501273155212, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.24172338843345642, "epoch": 2.7340686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.3895605040650352, "kl": 0.10444362461566925, "learning_rate": 2.4581931157647674e-08, "loss": 0.0014, "num_tokens": 70152914.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4717568159103394, "sampling/importance_sampling_ratio/mean": 0.9994391202926636, "sampling/importance_sampling_ratio/min": 0.5495074391365051, "sampling/sampling_logp_difference/max": 0.5987329483032227, "sampling/sampling_logp_difference/mean": 0.014560364186763763, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 173.40625, "completions/mean_terminated_length": 173.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.19719503819942474, "epoch": 2.735294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.05655211858590894, "kl": 0.053818728774785995, "learning_rate": 2.4361793870289028e-08, "loss": 0.0006, "num_tokens": 70182156.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7174328565597534, "sampling/importance_sampling_ratio/mean": 1.0005311965942383, "sampling/importance_sampling_ratio/min": 0.5483784079551697, "sampling/sampling_logp_difference/max": 0.6007896661758423, "sampling/sampling_logp_difference/mean": 0.012780500575900078, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 200.890625, "completions/mean_terminated_length": 200.890625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2707655429840088, "epoch": 2.736519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.3971192336391878, "kl": 0.08546942472457886, "learning_rate": 2.4142622098356326e-08, "loss": -0.0203, "num_tokens": 70213301.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007374286651611, "sampling/importance_sampling_ratio/min": 0.4582717716693878, "sampling/sampling_logp_difference/max": 1.49442720413208, "sampling/sampling_logp_difference/mean": 0.01658693701028824, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 199.40625, "completions/mean_terminated_length": 199.40625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24402666091918945, "epoch": 2.7377450980392157, "frac_reward_zero_std": 0.25, "grad_norm": 1.9094111257876882, "kl": 0.07704515755176544, "learning_rate": 2.3924416286754345e-08, "loss": -0.0374, "num_tokens": 70241343.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002003908157349, "sampling/importance_sampling_ratio/min": 0.4584660232067108, "sampling/sampling_logp_difference/max": 0.7798690795898438, "sampling/sampling_logp_difference/mean": 0.014194745570421219, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 179.640625, "completions/mean_terminated_length": 179.640625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1955706626176834, "epoch": 2.7389705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.049345331508012194, "kl": 0.05805432051420212, "learning_rate": 2.3707176878426882e-08, "loss": 0.0006, "num_tokens": 70270840.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9099674224853516, "sampling/importance_sampling_ratio/mean": 1.0000171661376953, "sampling/importance_sampling_ratio/min": 0.5362978577613831, "sampling/sampling_logp_difference/max": 0.6470861434936523, "sampling/sampling_logp_difference/mean": 0.012442233972251415, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 141.59375, "completions/mean_terminated_length": 141.59375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.17659026384353638, "epoch": 2.7401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.08369664875217793, "kl": 0.07052315026521683, "learning_rate": 2.3490904314356407e-08, "loss": 0.0007, "num_tokens": 70297630.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5473384857177734, "sampling/importance_sampling_ratio/mean": 1.0005820989608765, "sampling/importance_sampling_ratio/min": 0.6202817559242249, "sampling/sampling_logp_difference/max": 0.4775815010070801, "sampling/sampling_logp_difference/mean": 0.010419541969895363, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 222.96875, "completions/mean_terminated_length": 222.96875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.18989717960357666, "epoch": 2.741421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.1308519337501126, "kl": 0.04831966757774353, "learning_rate": 2.327559903356241e-08, "loss": 0.0332, "num_tokens": 70337260.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999310314655304, "sampling/importance_sampling_ratio/min": 0.5668860673904419, "sampling/sampling_logp_difference/max": 0.7492504119873047, "sampling/sampling_logp_difference/mean": 0.011877134442329407, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 162.984375, "completions/mean_terminated_length": 162.984375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.19407083094120026, "epoch": 2.7426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.06408573025545625, "kl": 0.07185148447751999, "learning_rate": 2.3061261473101002e-08, "loss": 0.0007, "num_tokens": 70369323.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002174377441406, "sampling/importance_sampling_ratio/min": 0.6254509687423706, "sampling/sampling_logp_difference/max": 0.9231328964233398, "sampling/sampling_logp_difference/mean": 0.012973986566066742, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 189.78125, "completions/mean_terminated_length": 189.78125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3031126856803894, "epoch": 2.743872549019608, "frac_reward_zero_std": 0.5, "grad_norm": 1.9924284485712511, "kl": 0.12816128134727478, "learning_rate": 2.2847892068063755e-08, "loss": -0.0145, "num_tokens": 70403197.0, "reward": 0.5, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000558614730835, "sampling/importance_sampling_ratio/min": 0.48257020115852356, "sampling/sampling_logp_difference/max": 0.8784655332565308, "sampling/sampling_logp_difference/mean": 0.017242250964045525, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 209.640625, "completions/mean_terminated_length": 209.640625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.24757881462574005, "epoch": 2.7450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.041347249130428226, "kl": 0.05959932878613472, "learning_rate": 2.263549125157721e-08, "loss": 0.0006, "num_tokens": 70437350.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6208202838897705, "sampling/importance_sampling_ratio/mean": 0.9997664093971252, "sampling/importance_sampling_ratio/min": 0.19745701551437378, "sampling/sampling_logp_difference/max": 1.6222343444824219, "sampling/sampling_logp_difference/mean": 0.015598781406879425, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 179.703125, "completions/mean_terminated_length": 179.703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22596284747123718, "epoch": 2.7463235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.5586376139596525, "kl": 0.10207671672105789, "learning_rate": 2.242405945480147e-08, "loss": 0.004, "num_tokens": 70465043.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5484459400177002, "sampling/importance_sampling_ratio/mean": 1.000705599784851, "sampling/importance_sampling_ratio/min": 0.5239237546920776, "sampling/sampling_logp_difference/max": 0.6464091539382935, "sampling/sampling_logp_difference/mean": 0.01542605459690094, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 211.46875, "completions/mean_terminated_length": 211.46875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23992714285850525, "epoch": 2.747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.659197625635065, "kl": 0.05521634966135025, "learning_rate": 2.2213597106929605e-08, "loss": -0.0725, "num_tokens": 70500913.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.768282175064087, "sampling/importance_sampling_ratio/mean": 1.0001716613769531, "sampling/importance_sampling_ratio/min": 0.47808602452278137, "sampling/sampling_logp_difference/max": 0.7379646301269531, "sampling/sampling_logp_difference/mean": 0.016141241416335106, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 182.53125, "completions/mean_terminated_length": 182.53125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2644326388835907, "epoch": 2.748774509803922, "frac_reward_zero_std": 0.5, "grad_norm": 2.0797480351914714, "kl": 0.12182864546775818, "learning_rate": 2.200410463518704e-08, "loss": 0.0061, "num_tokens": 70527683.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.656259298324585, "sampling/importance_sampling_ratio/mean": 0.999809980392456, "sampling/importance_sampling_ratio/min": 0.5676479339599609, "sampling/sampling_logp_difference/max": 0.5662539005279541, "sampling/sampling_logp_difference/mean": 0.015558083541691303, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 234.390625, "completions/mean_terminated_length": 234.390625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.17076224088668823, "epoch": 2.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.04480063796229846, "kl": 0.0427418127655983, "learning_rate": 2.1795582464830153e-08, "loss": 0.0004, "num_tokens": 70558124.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999072551727295, "sampling/importance_sampling_ratio/min": 0.4287963807582855, "sampling/sampling_logp_difference/max": 0.8467731475830078, "sampling/sampling_logp_difference/mean": 0.011818887665867805, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 158.4375, "completions/mean_terminated_length": 158.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.1844603419303894, "epoch": 2.751225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.5747770439482864, "kl": 0.12585288286209106, "learning_rate": 2.1588031019145636e-08, "loss": 0.0014, "num_tokens": 70584200.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6396639347076416, "sampling/importance_sampling_ratio/mean": 0.999675452709198, "sampling/importance_sampling_ratio/min": 0.1735827922821045, "sampling/sampling_logp_difference/max": 1.7511005401611328, "sampling/sampling_logp_difference/mean": 0.011955867521464825, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 172.640625, "completions/mean_terminated_length": 172.640625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.21045391261577606, "epoch": 2.752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.049648451221509, "kl": 0.06508484482765198, "learning_rate": 2.13814507194498e-08, "loss": 0.0006, "num_tokens": 70612209.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9963213205337524, "sampling/importance_sampling_ratio/mean": 1.0002024173736572, "sampling/importance_sampling_ratio/min": 0.606671154499054, "sampling/sampling_logp_difference/max": 0.6913061141967773, "sampling/sampling_logp_difference/mean": 0.013279307633638382, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 213.3125, "completions/mean_terminated_length": 213.3125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3622591495513916, "epoch": 2.7536764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.2309385357333875, "kl": 0.097495436668396, "learning_rate": 2.1175841985087707e-08, "loss": -0.0041, "num_tokens": 70646645.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5279804468154907, "sampling/importance_sampling_ratio/mean": 0.9995732307434082, "sampling/importance_sampling_ratio/min": 0.4854530394077301, "sampling/sampling_logp_difference/max": 0.722672700881958, "sampling/sampling_logp_difference/mean": 0.0192355178296566, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 205.828125, "completions/mean_terminated_length": 205.828125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.20620039105415344, "epoch": 2.7549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.3278783409445232, "kl": 0.0781605988740921, "learning_rate": 2.097120523343199e-08, "loss": -0.0055, "num_tokens": 70677946.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.601424217224121, "sampling/importance_sampling_ratio/mean": 0.9992896914482117, "sampling/importance_sampling_ratio/min": 0.5440990328788757, "sampling/sampling_logp_difference/max": 0.6086239814758301, "sampling/sampling_logp_difference/mean": 0.013438969850540161, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.312122106552124, "epoch": 2.756127450980392, "frac_reward_zero_std": 0.25, "grad_norm": 2.517810663801415, "kl": 0.1392800509929657, "learning_rate": 2.076754087988214e-08, "loss": 0.0082, "num_tokens": 70704998.0, "reward": 0.3125, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.9059869050979614, "sampling/importance_sampling_ratio/mean": 1.0000531673431396, "sampling/importance_sampling_ratio/min": 0.61529940366745, "sampling/sampling_logp_difference/max": 0.6449999809265137, "sampling/sampling_logp_difference/mean": 0.016311530023813248, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 214.828125, "completions/mean_terminated_length": 214.828125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.18979710340499878, "epoch": 2.7573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.1029685330856562, "kl": 0.05746053159236908, "learning_rate": 2.0564849337864122e-08, "loss": -0.0047, "num_tokens": 70736491.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998486042022705, "sampling/importance_sampling_ratio/min": 0.4597283601760864, "sampling/sampling_logp_difference/max": 0.777119517326355, "sampling/sampling_logp_difference/mean": 0.012833474203944206, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 220.234375, "completions/mean_terminated_length": 220.234375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.25403162837028503, "epoch": 2.758578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.6124423023997865, "kl": 0.07223007082939148, "learning_rate": 2.036313101882875e-08, "loss": 0.0606, "num_tokens": 70775386.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001699924468994, "sampling/importance_sampling_ratio/min": 0.32495278120040894, "sampling/sampling_logp_difference/max": 1.1240754127502441, "sampling/sampling_logp_difference/mean": 0.014939755201339722, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 213.171875, "completions/mean_terminated_length": 213.171875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.34486424922943115, "epoch": 2.7598039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.7254100844364808, "kl": 0.125754714012146, "learning_rate": 2.0162386332251648e-08, "loss": -0.0044, "num_tokens": 70809413.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6528112888336182, "sampling/importance_sampling_ratio/mean": 0.999992311000824, "sampling/importance_sampling_ratio/min": 0.4950985610485077, "sampling/sampling_logp_difference/max": 0.7029983997344971, "sampling/sampling_logp_difference/mean": 0.01870739459991455, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 250.265625, "completions/mean_terminated_length": 250.265625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.279003769159317, "epoch": 2.7610294117647056, "frac_reward_zero_std": 0.25, "grad_norm": 1.8117417539800733, "kl": 0.06691594421863556, "learning_rate": 1.9962615685631568e-08, "loss": -0.0738, "num_tokens": 70842790.0, "reward": 0.28125, "reward_std": 0.6833621263504028, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.9848772287368774, "sampling/importance_sampling_ratio/mean": 0.999861478805542, "sampling/importance_sampling_ratio/min": 0.4800865352153778, "sampling/sampling_logp_difference/max": 0.7337889671325684, "sampling/sampling_logp_difference/mean": 0.014689471572637558, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 174.765625, "completions/mean_terminated_length": 174.765625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.16617852449417114, "epoch": 2.7622549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.7688961229273223, "kl": 0.06370288133621216, "learning_rate": 1.976381948449035e-08, "loss": 0.0283, "num_tokens": 70878055.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000295639038086, "sampling/importance_sampling_ratio/min": 0.3053562641143799, "sampling/sampling_logp_difference/max": 1.1862760782241821, "sampling/sampling_logp_difference/mean": 0.012256176210939884, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 212.578125, "completions/mean_terminated_length": 212.578125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3126332759857178, "epoch": 2.763480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.1573843302974975, "kl": 0.09631580114364624, "learning_rate": 1.9565998132371808e-08, "loss": -0.0185, "num_tokens": 70914028.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.879313588142395, "sampling/importance_sampling_ratio/mean": 0.999764084815979, "sampling/importance_sampling_ratio/min": 0.5262352824211121, "sampling/sampling_logp_difference/max": 0.6420068740844727, "sampling/sampling_logp_difference/mean": 0.01587415672838688, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2396133840084076, "epoch": 2.764705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 1.1332077897009292, "kl": 0.08597449213266373, "learning_rate": 1.936915203084055e-08, "loss": -0.0046, "num_tokens": 70948140.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7887362241744995, "sampling/importance_sampling_ratio/mean": 1.0001415014266968, "sampling/importance_sampling_ratio/min": 0.5966588854789734, "sampling/sampling_logp_difference/max": 0.5815093517303467, "sampling/sampling_logp_difference/mean": 0.013765338808298111, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 158.828125, "completions/mean_terminated_length": 158.828125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.25703465938568115, "epoch": 2.7659313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.4201686527609199, "kl": 0.10735650360584259, "learning_rate": 1.9173281579481894e-08, "loss": 0.002, "num_tokens": 70975633.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8322973251342773, "sampling/importance_sampling_ratio/mean": 1.0004141330718994, "sampling/importance_sampling_ratio/min": 0.3579929769039154, "sampling/sampling_logp_difference/max": 1.0272419452667236, "sampling/sampling_logp_difference/mean": 0.01574607938528061, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 155.28125, "completions/mean_terminated_length": 155.28125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.22355003654956818, "epoch": 2.767156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.3193496489083907, "kl": 0.07810181379318237, "learning_rate": 1.897838717590028e-08, "loss": -0.0119, "num_tokens": 71008387.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7434394359588623, "sampling/importance_sampling_ratio/mean": 0.9994719624519348, "sampling/importance_sampling_ratio/min": 0.48811832070350647, "sampling/sampling_logp_difference/max": 0.7171974182128906, "sampling/sampling_logp_difference/mean": 0.012913529761135578, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 197.421875, "completions/mean_terminated_length": 197.421875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.18570518493652344, "epoch": 2.7683823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.04140384901991634, "kl": 0.04976304620504379, "learning_rate": 1.8784469215719077e-08, "loss": 0.0005, "num_tokens": 71040126.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4297953844070435, "sampling/importance_sampling_ratio/mean": 0.9996899962425232, "sampling/importance_sampling_ratio/min": 0.4256052076816559, "sampling/sampling_logp_difference/max": 0.8542431592941284, "sampling/sampling_logp_difference/mean": 0.01033171359449625, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 199.109375, "completions/mean_terminated_length": 199.109375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.24861127138137817, "epoch": 2.769607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.1049193177717076, "kl": 0.09222990274429321, "learning_rate": 1.8591528092579524e-08, "loss": -0.0097, "num_tokens": 71070277.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9989231824874878, "sampling/importance_sampling_ratio/mean": 1.0008349418640137, "sampling/importance_sampling_ratio/min": 0.6117143630981445, "sampling/sampling_logp_difference/max": 0.6926085948944092, "sampling/sampling_logp_difference/mean": 0.014516171999275684, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29695504903793335, "epoch": 2.7708333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.0262807260800688, "kl": 0.08197468519210815, "learning_rate": 1.8399564198139707e-08, "loss": 0.0013, "num_tokens": 71107221.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6657474040985107, "sampling/importance_sampling_ratio/mean": 0.9994725584983826, "sampling/importance_sampling_ratio/min": 0.36581653356552124, "sampling/sampling_logp_difference/max": 1.0056233406066895, "sampling/sampling_logp_difference/mean": 0.015666469931602478, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 201.34375, "completions/mean_terminated_length": 201.34375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.18237395584583282, "epoch": 2.7720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.04637110740425972, "kl": 0.048668306320905685, "learning_rate": 1.8208577922074308e-08, "loss": 0.0004, "num_tokens": 71136587.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.672214388847351, "sampling/importance_sampling_ratio/mean": 1.0000160932540894, "sampling/importance_sampling_ratio/min": 0.593717098236084, "sampling/sampling_logp_difference/max": 0.5213522911071777, "sampling/sampling_logp_difference/mean": 0.011836504563689232, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 172.171875, "completions/mean_terminated_length": 172.171875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2061919867992401, "epoch": 2.7732843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0663790791649386, "kl": 0.07428238540887833, "learning_rate": 1.8018569652073378e-08, "loss": 0.0007, "num_tokens": 71169798.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006260871887207, "sampling/importance_sampling_ratio/min": 0.5007621049880981, "sampling/sampling_logp_difference/max": 0.7730910778045654, "sampling/sampling_logp_difference/mean": 0.014331132173538208, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.18149055540561676, "epoch": 2.7745098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.9289501476314372, "kl": 0.07153521478176117, "learning_rate": 1.7829539773841608e-08, "loss": 0.054, "num_tokens": 71196558.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000271201133728, "sampling/importance_sampling_ratio/min": 0.4579724669456482, "sampling/sampling_logp_difference/max": 1.1921019554138184, "sampling/sampling_logp_difference/mean": 0.011731035076081753, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.285845011472702, "epoch": 2.775735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.05058568161150038, "kl": 0.06788494437932968, "learning_rate": 1.7641488671097606e-08, "loss": 0.0007, "num_tokens": 71229466.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9552366733551025, "sampling/importance_sampling_ratio/mean": 1.0000596046447754, "sampling/importance_sampling_ratio/min": 0.5756021738052368, "sampling/sampling_logp_difference/max": 0.6705112457275391, "sampling/sampling_logp_difference/mean": 0.014759533107280731, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 175.4375, "completions/mean_terminated_length": 175.4375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.21992114186286926, "epoch": 2.7769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.3331153499690847, "kl": 0.07214683294296265, "learning_rate": 1.745441672557335e-08, "loss": 0.0021, "num_tokens": 71257478.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6355787515640259, "sampling/importance_sampling_ratio/mean": 1.000448226928711, "sampling/importance_sampling_ratio/min": 0.396942138671875, "sampling/sampling_logp_difference/max": 0.92396479845047, "sampling/sampling_logp_difference/mean": 0.013438953086733818, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 197.140625, "completions/mean_terminated_length": 197.140625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.22281494736671448, "epoch": 2.778186274509804, "frac_reward_zero_std": 0.25, "grad_norm": 2.038184728011804, "kl": 0.09663940966129303, "learning_rate": 1.7268324317012973e-08, "loss": 0.0078, "num_tokens": 71293583.0, "reward": -0.0625, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.608886480331421, "sampling/importance_sampling_ratio/mean": 0.9999948740005493, "sampling/importance_sampling_ratio/min": 0.5804426074028015, "sampling/sampling_logp_difference/max": 0.5439643859863281, "sampling/sampling_logp_difference/mean": 0.01380915567278862, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 205.640625, "completions/mean_terminated_length": 205.640625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.251699835062027, "epoch": 2.7794117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.816421715670366, "kl": 0.09073509275913239, "learning_rate": 1.7083211823172184e-08, "loss": -0.1407, "num_tokens": 71336360.0, "reward": -0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.664367914199829, "sampling/importance_sampling_ratio/mean": 0.9992032051086426, "sampling/importance_sampling_ratio/min": 0.48935699462890625, "sampling/sampling_logp_difference/max": 0.714663028717041, "sampling/sampling_logp_difference/mean": 0.013966759666800499, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 159.28125, "completions/mean_terminated_length": 159.28125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.23307263851165771, "epoch": 2.780637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.524201781355275, "kl": 0.09997309744358063, "learning_rate": 1.6899079619817792e-08, "loss": -0.0029, "num_tokens": 71367162.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.8080004453659058, "sampling/importance_sampling_ratio/mean": 1.0001411437988281, "sampling/importance_sampling_ratio/min": 0.6400461792945862, "sampling/sampling_logp_difference/max": 0.5922214984893799, "sampling/sampling_logp_difference/mean": 0.0136976707726717, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 198.046875, "completions/mean_terminated_length": 198.046875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23901107907295227, "epoch": 2.781862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.2901761842056259, "kl": 0.05361397564411163, "learning_rate": 1.6715928080726415e-08, "loss": 0.0052, "num_tokens": 71394269.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.7307579517364502, "sampling/importance_sampling_ratio/mean": 1.000693917274475, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.5485594272613525, "sampling/sampling_logp_difference/mean": 0.01256822794675827, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 193.03125, "completions/mean_terminated_length": 193.03125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2347640097141266, "epoch": 2.7830882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.2367901541403803, "kl": 0.07756893336772919, "learning_rate": 1.653375757768405e-08, "loss": 0.0092, "num_tokens": 71429039.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992195963859558, "sampling/importance_sampling_ratio/min": 0.3659481406211853, "sampling/sampling_logp_difference/max": 1.0052636861801147, "sampling/sampling_logp_difference/mean": 0.016058577224612236, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 206.359375, "completions/mean_terminated_length": 206.359375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.23786799609661102, "epoch": 2.784313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 2.0651149788157532, "kl": 0.11119990795850754, "learning_rate": 1.6352568480485275e-08, "loss": 0.0857, "num_tokens": 71461702.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000898838043213, "sampling/importance_sampling_ratio/min": 0.5441979169845581, "sampling/sampling_logp_difference/max": 0.7225730419158936, "sampling/sampling_logp_difference/mean": 0.014432122930884361, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 234.765625, "completions/mean_terminated_length": 234.765625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.26328566670417786, "epoch": 2.7855392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.76476429463243, "kl": 0.05306101590394974, "learning_rate": 1.6172361156932547e-08, "loss": 0.0296, "num_tokens": 71495927.0, "reward": 0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.7942874431610107, "sampling/importance_sampling_ratio/mean": 0.9999489784240723, "sampling/importance_sampling_ratio/min": 0.6099900603294373, "sampling/sampling_logp_difference/max": 0.5846079587936401, "sampling/sampling_logp_difference/mean": 0.014174356125295162, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 160.859375, "completions/mean_terminated_length": 160.859375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2133684754371643, "epoch": 2.786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.04760407754695877, "kl": 0.08011157065629959, "learning_rate": 1.5993135972835303e-08, "loss": 0.0007, "num_tokens": 71519790.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9553382396697998, "sampling/importance_sampling_ratio/mean": 1.0002846717834473, "sampling/importance_sampling_ratio/min": 0.48663073778152466, "sampling/sampling_logp_difference/max": 0.7202496528625488, "sampling/sampling_logp_difference/mean": 0.014282667078077793, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 226.59375, "completions/mean_terminated_length": 226.59375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2044447660446167, "epoch": 2.7879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.4896579938687486, "kl": 0.060443829745054245, "learning_rate": 1.581489329200919e-08, "loss": -0.0168, "num_tokens": 71551812.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997121691703796, "sampling/importance_sampling_ratio/min": 0.378704696893692, "sampling/sampling_logp_difference/max": 1.0303916931152344, "sampling/sampling_logp_difference/mean": 0.01223063375800848, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 180.6875, "completions/mean_terminated_length": 180.6875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.28990983963012695, "epoch": 2.7892156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.312920820950326, "kl": 0.14007467031478882, "learning_rate": 1.5637633476275724e-08, "loss": 0.0153, "num_tokens": 71579408.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6084250211715698, "sampling/importance_sampling_ratio/mean": 1.000478744506836, "sampling/importance_sampling_ratio/min": 0.3463849425315857, "sampling/sampling_logp_difference/max": 1.0602045059204102, "sampling/sampling_logp_difference/mean": 0.01548030786216259, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.18988262116909027, "epoch": 2.7904411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.0685159768950878, "kl": 0.05716770514845848, "learning_rate": 1.5461356885461075e-08, "loss": 0.0186, "num_tokens": 71607232.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6598341464996338, "sampling/importance_sampling_ratio/mean": 1.0002659559249878, "sampling/importance_sampling_ratio/min": 0.6066067218780518, "sampling/sampling_logp_difference/max": 0.5067176818847656, "sampling/sampling_logp_difference/mean": 0.010263778269290924, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 257.953125, "completions/mean_terminated_length": 257.953125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.28556105494499207, "epoch": 2.7916666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 1.510623953144217, "kl": 0.1083979457616806, "learning_rate": 1.528606387739545e-08, "loss": 0.0404, "num_tokens": 71649165.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999854564666748, "sampling/importance_sampling_ratio/min": 0.49440720677375793, "sampling/sampling_logp_difference/max": 0.7608175277709961, "sampling/sampling_logp_difference/mean": 0.015845760703086853, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 211.0625, "completions/mean_terminated_length": 211.0625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.22970786690711975, "epoch": 2.792892156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.6020697058076987, "kl": 0.10661774128675461, "learning_rate": 1.5111754807912546e-08, "loss": 0.0193, "num_tokens": 71677921.0, "reward": 0.40625, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5262553691864014, "sampling/importance_sampling_ratio/mean": 1.0001559257507324, "sampling/importance_sampling_ratio/min": 0.43244117498397827, "sampling/sampling_logp_difference/max": 0.8383089900016785, "sampling/sampling_logp_difference/mean": 0.012312108650803566, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 178.96875, "completions/mean_terminated_length": 178.96875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.15283872187137604, "epoch": 2.7941176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.03991502990475642, "kl": 0.04768148809671402, "learning_rate": 1.493843003084888e-08, "loss": 0.0005, "num_tokens": 71711263.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5243661403656006, "sampling/importance_sampling_ratio/mean": 1.001168966293335, "sampling/importance_sampling_ratio/min": 0.6410229206085205, "sampling/sampling_logp_difference/max": 0.44469010829925537, "sampling/sampling_logp_difference/mean": 0.010422519408166409, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 150.578125, "completions/mean_terminated_length": 150.578125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.22832804918289185, "epoch": 2.795343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.3769658370315165, "kl": 0.07298359274864197, "learning_rate": 1.4766089898042677e-08, "loss": -0.0075, "num_tokens": 71737188.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5746679306030273, "sampling/importance_sampling_ratio/mean": 1.0000483989715576, "sampling/importance_sampling_ratio/min": 0.5372995734214783, "sampling/sampling_logp_difference/max": 0.6211994886398315, "sampling/sampling_logp_difference/mean": 0.013986149802803993, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 169.59375, "completions/mean_terminated_length": 169.59375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18246373534202576, "epoch": 2.7965686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.07112009120357819, "kl": 0.05187829211354256, "learning_rate": 1.4594734759333482e-08, "loss": 0.0005, "num_tokens": 71766746.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004734992980957, "sampling/importance_sampling_ratio/min": 0.6158343553543091, "sampling/sampling_logp_difference/max": 1.4400757551193237, "sampling/sampling_logp_difference/mean": 0.011682495474815369, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 151.296875, "completions/mean_terminated_length": 151.296875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2154589593410492, "epoch": 2.797794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.07547142912461965, "kl": 0.08189049363136292, "learning_rate": 1.4424364962561386e-08, "loss": 0.0008, "num_tokens": 71794749.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.762123703956604, "sampling/importance_sampling_ratio/mean": 0.9996479749679565, "sampling/importance_sampling_ratio/min": 0.5038022994995117, "sampling/sampling_logp_difference/max": 0.6855714321136475, "sampling/sampling_logp_difference/mean": 0.012780768796801567, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 152.1875, "completions/mean_terminated_length": 152.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.26381582021713257, "epoch": 2.799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.9889896342701454, "kl": 0.13400951027870178, "learning_rate": 1.4254980853566246e-08, "loss": 0.0213, "num_tokens": 71822761.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5837942361831665, "sampling/importance_sampling_ratio/mean": 1.000157117843628, "sampling/importance_sampling_ratio/min": 0.5473312735557556, "sampling/sampling_logp_difference/max": 0.6027010679244995, "sampling/sampling_logp_difference/mean": 0.015911083668470383, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 166.046875, "completions/mean_terminated_length": 166.046875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2303771823644638, "epoch": 2.8002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.3140962471762698, "kl": 0.13876497745513916, "learning_rate": 1.4086582776187239e-08, "loss": -0.0087, "num_tokens": 71853676.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5536686182022095, "sampling/importance_sampling_ratio/mean": 0.9994738101959229, "sampling/importance_sampling_ratio/min": 0.49137696623802185, "sampling/sampling_logp_difference/max": 0.7105436325073242, "sampling/sampling_logp_difference/mean": 0.014124227687716484, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 207.421875, "completions/mean_terminated_length": 207.421875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2937871813774109, "epoch": 2.8014705882352944, "frac_reward_zero_std": 0.5, "grad_norm": 2.017779204162288, "kl": 0.08879870176315308, "learning_rate": 1.3919171072261537e-08, "loss": 0.0605, "num_tokens": 71887911.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6441470384597778, "sampling/importance_sampling_ratio/mean": 0.9999390244483948, "sampling/importance_sampling_ratio/min": 0.5000088214874268, "sampling/sampling_logp_difference/max": 0.6931295394897461, "sampling/sampling_logp_difference/mean": 0.016515297815203667, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 209.140625, "completions/mean_terminated_length": 209.140625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.21904979646205902, "epoch": 2.8026960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.194619906314769, "kl": 0.05970887839794159, "learning_rate": 1.3752746081624467e-08, "loss": 0.0022, "num_tokens": 71921472.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6574615240097046, "sampling/importance_sampling_ratio/mean": 0.9997144937515259, "sampling/importance_sampling_ratio/min": 0.6155778169631958, "sampling/sampling_logp_difference/max": 0.5052871704101562, "sampling/sampling_logp_difference/mean": 0.013201612047851086, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 234.796875, "completions/mean_terminated_length": 234.796875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1851789802312851, "epoch": 2.803921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0501740668660753, "kl": 0.06818913668394089, "learning_rate": 1.3587308142108178e-08, "loss": 0.0005, "num_tokens": 71955075.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5082266330718994, "sampling/importance_sampling_ratio/mean": 0.9994621872901917, "sampling/importance_sampling_ratio/min": 0.2729172706604004, "sampling/sampling_logp_difference/max": 1.2985866069793701, "sampling/sampling_logp_difference/mean": 0.012057261541485786, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 153.21875, "completions/mean_terminated_length": 153.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20215308666229248, "epoch": 2.8051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.04414976885733, "kl": 0.07899628579616547, "learning_rate": 1.3422857589541148e-08, "loss": -0.0029, "num_tokens": 71979233.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4445340633392334, "sampling/importance_sampling_ratio/mean": 0.9998143315315247, "sampling/importance_sampling_ratio/min": 0.49491244554519653, "sampling/sampling_logp_difference/max": 0.7033743858337402, "sampling/sampling_logp_difference/mean": 0.012561185285449028, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 185.703125, "completions/mean_terminated_length": 185.703125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.28129348158836365, "epoch": 2.806372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.0391032427759417, "kl": 0.16123129427433014, "learning_rate": 1.3259394757747677e-08, "loss": -0.0029, "num_tokens": 72005262.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.8564090728759766, "sampling/importance_sampling_ratio/mean": 1.0004281997680664, "sampling/importance_sampling_ratio/min": 0.5909593105316162, "sampling/sampling_logp_difference/max": 0.6186439990997314, "sampling/sampling_logp_difference/mean": 0.015385773964226246, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 236.90625, "completions/mean_terminated_length": 236.90625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.21015983819961548, "epoch": 2.8075980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.5423523965562058, "kl": 0.05395539849996567, "learning_rate": 1.3096919978546838e-08, "loss": 0.0334, "num_tokens": 72035608.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8786742687225342, "sampling/importance_sampling_ratio/mean": 1.000143051147461, "sampling/importance_sampling_ratio/min": 0.5484020113945007, "sampling/sampling_logp_difference/max": 0.6305663585662842, "sampling/sampling_logp_difference/mean": 0.012865163385868073, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 227.421875, "completions/mean_terminated_length": 227.421875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1998668611049652, "epoch": 2.8088235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.7835263932126011, "kl": 0.06374235451221466, "learning_rate": 1.2935433581752365e-08, "loss": -0.0049, "num_tokens": 72067299.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.550827145576477, "sampling/importance_sampling_ratio/mean": 0.9996589422225952, "sampling/importance_sampling_ratio/min": 0.6136133670806885, "sampling/sampling_logp_difference/max": 0.4883902072906494, "sampling/sampling_logp_difference/mean": 0.012076803483068943, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 230.65625, "completions/mean_terminated_length": 230.65625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.19293788075447083, "epoch": 2.810049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.1650726238622964, "kl": 0.05070199817419052, "learning_rate": 1.2774935895171091e-08, "loss": 0.0186, "num_tokens": 72097997.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.826583981513977, "sampling/importance_sampling_ratio/mean": 1.0000672340393066, "sampling/importance_sampling_ratio/min": 0.5401414036750793, "sampling/sampling_logp_difference/max": 0.6159243583679199, "sampling/sampling_logp_difference/mean": 0.011585809290409088, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 204.53125, "completions/mean_terminated_length": 204.53125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.1433851271867752, "epoch": 2.811274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 1.2036905104523585, "kl": 0.060173049569129944, "learning_rate": 1.2615427244603405e-08, "loss": 0.0364, "num_tokens": 72125951.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5772333145141602, "sampling/importance_sampling_ratio/mean": 1.0000299215316772, "sampling/importance_sampling_ratio/min": 0.4926629066467285, "sampling/sampling_logp_difference/max": 0.7079300880432129, "sampling/sampling_logp_difference/mean": 0.010021264664828777, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 201.234375, "completions/mean_terminated_length": 201.234375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.20590944588184357, "epoch": 2.8125, "frac_reward_zero_std": 0.75, "grad_norm": 0.8979533208446774, "kl": 0.08708874881267548, "learning_rate": 1.2456907953841633e-08, "loss": -0.0018, "num_tokens": 72154942.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9617762565612793, "sampling/importance_sampling_ratio/mean": 1.0001060962677002, "sampling/importance_sampling_ratio/min": 0.5467776656150818, "sampling/sampling_logp_difference/max": 0.6738502979278564, "sampling/sampling_logp_difference/mean": 0.011773111298680305, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 208.59375, "completions/mean_terminated_length": 208.59375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.27260729670524597, "epoch": 2.813725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.5072733075937548, "kl": 0.07039877027273178, "learning_rate": 1.2299378344669986e-08, "loss": 0.0132, "num_tokens": 72183188.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9314006567001343, "sampling/importance_sampling_ratio/mean": 0.9993040561676025, "sampling/importance_sampling_ratio/min": 0.4164551794528961, "sampling/sampling_logp_difference/max": 0.8759764432907104, "sampling/sampling_logp_difference/mean": 0.015850547701120377, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 255.90625, "completions/mean_terminated_length": 255.90625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.257384717464447, "epoch": 2.814950980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.496029110977191, "kl": 0.06804078817367554, "learning_rate": 1.2142838736863559e-08, "loss": -0.0224, "num_tokens": 72214878.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6147757768630981, "sampling/importance_sampling_ratio/mean": 1.0000439882278442, "sampling/importance_sampling_ratio/min": 0.4597201943397522, "sampling/sampling_logp_difference/max": 0.777137279510498, "sampling/sampling_logp_difference/mean": 0.014001351781189442, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 147.59375, "completions/mean_terminated_length": 147.59375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2722303569316864, "epoch": 2.8161764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 1.2582469630512934, "kl": 0.13115857541561127, "learning_rate": 1.1987289448187777e-08, "loss": 0.0013, "num_tokens": 72242196.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000063419342041, "sampling/importance_sampling_ratio/min": 0.3725292980670929, "sampling/sampling_logp_difference/max": 0.9874396324157715, "sampling/sampling_logp_difference/mean": 0.01594376750290394, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.2505417466163635, "epoch": 2.8174019607843137, "frac_reward_zero_std": 0.25, "grad_norm": 2.1754835273744177, "kl": 0.06322623044252396, "learning_rate": 1.183273079439795e-08, "loss": -0.042, "num_tokens": 72276604.0, "reward": 0.03125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6416311264038086, "sampling/importance_sampling_ratio/mean": 0.9998761415481567, "sampling/importance_sampling_ratio/min": 0.5535147190093994, "sampling/sampling_logp_difference/max": 0.5914669036865234, "sampling/sampling_logp_difference/mean": 0.01484741736203432, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 184.609375, "completions/mean_terminated_length": 184.609375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2452416718006134, "epoch": 2.818627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.045131944661177266, "kl": 0.07852520793676376, "learning_rate": 1.167916308923822e-08, "loss": 0.0008, "num_tokens": 72310803.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8235124349594116, "sampling/importance_sampling_ratio/mean": 1.0006556510925293, "sampling/importance_sampling_ratio/min": 0.5999748110771179, "sampling/sampling_logp_difference/max": 0.6007645130157471, "sampling/sampling_logp_difference/mean": 0.013840295374393463, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 198.3125, "completions/mean_terminated_length": 198.3125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.17873989045619965, "epoch": 2.8198529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.3366842538078287, "kl": 0.077562615275383, "learning_rate": 1.152658664444145e-08, "loss": 0.044, "num_tokens": 72342071.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9442808628082275, "sampling/importance_sampling_ratio/mean": 1.000274896621704, "sampling/importance_sampling_ratio/min": 0.3858293294906616, "sampling/sampling_logp_difference/max": 0.9523601531982422, "sampling/sampling_logp_difference/mean": 0.012566540390253067, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 192.65625, "completions/mean_terminated_length": 192.65625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24401333928108215, "epoch": 2.821078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.03960337741431701, "kl": 0.06033939868211746, "learning_rate": 1.1375001769727999e-08, "loss": 0.0006, "num_tokens": 72373649.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.649314045906067, "sampling/importance_sampling_ratio/mean": 0.9994627237319946, "sampling/importance_sampling_ratio/min": 0.6040035486221313, "sampling/sampling_logp_difference/max": 0.5041751861572266, "sampling/sampling_logp_difference/mean": 0.0143990283831954, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 230.015625, "completions/mean_terminated_length": 230.015625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.24470923840999603, "epoch": 2.8223039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.4262812566610688, "kl": 0.07444335520267487, "learning_rate": 1.1224408772805671e-08, "loss": 0.0377, "num_tokens": 72407570.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9373067617416382, "sampling/importance_sampling_ratio/mean": 1.0001022815704346, "sampling/importance_sampling_ratio/min": 0.5676461458206177, "sampling/sampling_logp_difference/max": 0.6612987518310547, "sampling/sampling_logp_difference/mean": 0.014205005019903183, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 230.21875, "completions/mean_terminated_length": 230.21875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2483881264925003, "epoch": 2.8235294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.570681491857653, "kl": 0.08415378630161285, "learning_rate": 1.1074807959368715e-08, "loss": -0.005, "num_tokens": 72438672.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6552093029022217, "sampling/importance_sampling_ratio/mean": 1.0005154609680176, "sampling/importance_sampling_ratio/min": 0.5130184888839722, "sampling/sampling_logp_difference/max": 0.6674433946609497, "sampling/sampling_logp_difference/mean": 0.014159232378005981, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 211.140625, "completions/mean_terminated_length": 211.140625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2508814036846161, "epoch": 2.8247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.7507579893115275, "kl": 0.06842119991779327, "learning_rate": 1.0926199633097154e-08, "loss": -0.0181, "num_tokens": 72469945.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000756978988647, "sampling/importance_sampling_ratio/min": 0.44627147912979126, "sampling/sampling_logp_difference/max": 0.9812498092651367, "sampling/sampling_logp_difference/mean": 0.014487473294138908, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.25903528928756714, "epoch": 2.825980392156863, "frac_reward_zero_std": 0.25, "grad_norm": 2.3158324343348116, "kl": 0.10344567149877548, "learning_rate": 1.0778584095656685e-08, "loss": -0.038, "num_tokens": 72496241.0, "reward": 0.5, "reward_std": 0.5879635810852051, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5469341278076172, "sampling/importance_sampling_ratio/mean": 1.0004578828811646, "sampling/importance_sampling_ratio/min": 0.46739014983177185, "sampling/sampling_logp_difference/max": 0.7605909109115601, "sampling/sampling_logp_difference/mean": 0.016178598627448082, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 195.578125, "completions/mean_terminated_length": 195.578125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3704570531845093, "epoch": 2.827205882352941, "frac_reward_zero_std": 0.25, "grad_norm": 1.9893595155209258, "kl": 0.14270761609077454, "learning_rate": 1.0631961646697384e-08, "loss": -0.001, "num_tokens": 72532166.0, "reward": 0.46875, "reward_std": 0.7348873615264893, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999090433120728, "sampling/importance_sampling_ratio/min": 0.5401919484138489, "sampling/sampling_logp_difference/max": 0.9278788566589355, "sampling/sampling_logp_difference/mean": 0.020588595420122147, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 214.28125, "completions/mean_terminated_length": 214.28125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.23848213255405426, "epoch": 2.8284313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.09189250060888837, "kl": 0.08650361001491547, "learning_rate": 1.0486332583853564e-08, "loss": 0.0008, "num_tokens": 72567624.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001864433288574, "sampling/importance_sampling_ratio/min": 0.393848717212677, "sampling/sampling_logp_difference/max": 0.931788444519043, "sampling/sampling_logp_difference/mean": 0.013897368684411049, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.26923668384552, "epoch": 2.829656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.517856496680662, "kl": 0.09672123193740845, "learning_rate": 1.0341697202742971e-08, "loss": -0.0092, "num_tokens": 72594488.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6091969013214111, "sampling/importance_sampling_ratio/mean": 1.0006260871887207, "sampling/importance_sampling_ratio/min": 0.6420642733573914, "sampling/sampling_logp_difference/max": 0.4757351875305176, "sampling/sampling_logp_difference/mean": 0.014627790078520775, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 210.265625, "completions/mean_terminated_length": 210.265625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26740381121635437, "epoch": 2.8308823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 1.216964226093934, "kl": 0.20067372918128967, "learning_rate": 1.0198055796966253e-08, "loss": 0.0058, "num_tokens": 72633081.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997767210006714, "sampling/importance_sampling_ratio/min": 0.48305249214172363, "sampling/sampling_logp_difference/max": 0.7920181751251221, "sampling/sampling_logp_difference/mean": 0.016341229900717735, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 157.4375, "completions/mean_terminated_length": 157.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24633532762527466, "epoch": 2.832107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.5443078259153119, "kl": 0.13186615705490112, "learning_rate": 1.0055408658106446e-08, "loss": 0.0013, "num_tokens": 72660901.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6306818723678589, "sampling/importance_sampling_ratio/mean": 0.9997966885566711, "sampling/importance_sampling_ratio/min": 0.38238391280174255, "sampling/sampling_logp_difference/max": 0.9613301753997803, "sampling/sampling_logp_difference/mean": 0.016728542745113373, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 235.171875, "completions/mean_terminated_length": 235.171875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.24795357882976532, "epoch": 2.8333333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.466908573722674, "kl": 0.056717436760663986, "learning_rate": 9.913756075728086e-09, "loss": 0.01, "num_tokens": 72693408.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7994662523269653, "sampling/importance_sampling_ratio/mean": 1.0001740455627441, "sampling/importance_sampling_ratio/min": 0.6203963756561279, "sampling/sampling_logp_difference/max": 0.5874900817871094, "sampling/sampling_logp_difference/mean": 0.01397133432328701, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 182.296875, "completions/mean_terminated_length": 182.296875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.313896119594574, "epoch": 2.8345588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.7254654486321284, "kl": 0.10548032075166702, "learning_rate": 9.77309833737705e-09, "loss": -0.0251, "num_tokens": 72724147.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.71084725856781, "sampling/importance_sampling_ratio/mean": 1.0006495714187622, "sampling/importance_sampling_ratio/min": 0.4789983928203583, "sampling/sampling_logp_difference/max": 0.7360580563545227, "sampling/sampling_logp_difference/mean": 0.01815841719508171, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 200.140625, "completions/mean_terminated_length": 200.140625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.21204930543899536, "epoch": 2.8357843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 1.9665245001403913, "kl": 0.0741671472787857, "learning_rate": 9.633435728579553e-09, "loss": 0.0548, "num_tokens": 72762476.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6213301420211792, "sampling/importance_sampling_ratio/mean": 0.9997785091400146, "sampling/importance_sampling_ratio/min": 0.47327375411987305, "sampling/sampling_logp_difference/max": 0.7480813264846802, "sampling/sampling_logp_difference/mean": 0.013232581317424774, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 159.359375, "completions/mean_terminated_length": 159.359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.1839318573474884, "epoch": 2.8370098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.07242427594017536, "kl": 0.07046204805374146, "learning_rate": 9.494768532841868e-09, "loss": 0.0007, "num_tokens": 72788339.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996984004974365, "sampling/importance_sampling_ratio/min": 0.452145516872406, "sampling/sampling_logp_difference/max": 0.7937512397766113, "sampling/sampling_logp_difference/mean": 0.012966913171112537, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29241546988487244, "epoch": 2.838235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.2005985580410745, "kl": 0.10323096811771393, "learning_rate": 9.357097031649664e-09, "loss": 0.0064, "num_tokens": 72827155.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002491474151611, "sampling/importance_sampling_ratio/min": 0.4755294620990753, "sampling/sampling_logp_difference/max": 0.7449169158935547, "sampling/sampling_logp_difference/mean": 0.01652509719133377, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 207.546875, "completions/mean_terminated_length": 207.546875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2902880012989044, "epoch": 2.8394607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.05213254317795866, "kl": 0.09981686621904373, "learning_rate": 9.22042150446728e-09, "loss": 0.0009, "num_tokens": 72860934.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9124470949172974, "sampling/importance_sampling_ratio/mean": 0.9997761249542236, "sampling/importance_sampling_ratio/min": 0.5358520746231079, "sampling/sampling_logp_difference/max": 0.648383617401123, "sampling/sampling_logp_difference/mean": 0.015693334862589836, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.15571218729019165, "epoch": 2.840686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.06519853609540806, "kl": 0.05649857223033905, "learning_rate": 9.084742228737564e-09, "loss": 0.0006, "num_tokens": 72886786.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9490232467651367, "sampling/importance_sampling_ratio/mean": 0.9995454549789429, "sampling/importance_sampling_ratio/min": 0.1234641969203949, "sampling/sampling_logp_difference/max": 2.091804027557373, "sampling/sampling_logp_difference/mean": 0.011238237842917442, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2267010509967804, "epoch": 2.8419117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.4949329142363874, "kl": 0.05994710326194763, "learning_rate": 8.95005947988059e-09, "loss": -0.0171, "num_tokens": 72921898.0, "reward": 0.28125, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.483286738395691, "sampling/importance_sampling_ratio/mean": 0.9996479749679565, "sampling/importance_sampling_ratio/min": 0.6147379279136658, "sampling/sampling_logp_difference/max": 0.48655927181243896, "sampling/sampling_logp_difference/mean": 0.012886783108115196, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.22738264501094818, "epoch": 2.843137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.08576645963729009, "kl": 0.084224171936512, "learning_rate": 8.816373531293941e-09, "loss": 0.0008, "num_tokens": 72961258.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8171416521072388, "sampling/importance_sampling_ratio/mean": 1.0003807544708252, "sampling/importance_sampling_ratio/min": 0.4550248384475708, "sampling/sampling_logp_difference/max": 0.7874033451080322, "sampling/sampling_logp_difference/mean": 0.014613781124353409, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2447119653224945, "epoch": 2.844362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.7406505358690478, "kl": 0.12149256467819214, "learning_rate": 8.683684654351597e-09, "loss": 0.0162, "num_tokens": 72991970.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4657742977142334, "sampling/importance_sampling_ratio/mean": 1.0001851320266724, "sampling/importance_sampling_ratio/min": 0.5134783983230591, "sampling/sampling_logp_difference/max": 0.6665472984313965, "sampling/sampling_logp_difference/mean": 0.01529346127063036, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 170.390625, "completions/mean_terminated_length": 170.390625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.20992591977119446, "epoch": 2.8455882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.10045521240962015, "kl": 0.09384244680404663, "learning_rate": 8.551993118403656e-09, "loss": 0.0009, "num_tokens": 73026075.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7112091779708862, "sampling/importance_sampling_ratio/mean": 0.9992998838424683, "sampling/importance_sampling_ratio/min": 0.6041069626808167, "sampling/sampling_logp_difference/max": 0.5372002124786377, "sampling/sampling_logp_difference/mean": 0.012351354584097862, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 236.484375, "completions/mean_terminated_length": 236.484375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2420966625213623, "epoch": 2.846813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.9732764114682843, "kl": 0.061010293662548065, "learning_rate": 8.4212991907755e-09, "loss": 0.0168, "num_tokens": 73060618.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6221625804901123, "sampling/importance_sampling_ratio/mean": 1.0001819133758545, "sampling/importance_sampling_ratio/min": 0.3637353777885437, "sampling/sampling_logp_difference/max": 1.0113286972045898, "sampling/sampling_logp_difference/mean": 0.01266421191394329, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2921450734138489, "epoch": 2.8480392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.6091477379720234, "kl": 0.16933825612068176, "learning_rate": 8.291603136767521e-09, "loss": -0.0077, "num_tokens": 73086742.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997191429138184, "sampling/importance_sampling_ratio/min": 0.5376563668251038, "sampling/sampling_logp_difference/max": 0.713315486907959, "sampling/sampling_logp_difference/mean": 0.01770794577896595, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 164.578125, "completions/mean_terminated_length": 164.578125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.20299825072288513, "epoch": 2.849264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.5280261820469567, "kl": 0.08744926005601883, "learning_rate": 8.16290521965457e-09, "loss": 0.016, "num_tokens": 73111387.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000653266906738, "sampling/importance_sampling_ratio/min": 0.6009537577629089, "sampling/sampling_logp_difference/max": 0.6948471069335938, "sampling/sampling_logp_difference/mean": 0.013114454224705696, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 149.890625, "completions/mean_terminated_length": 149.890625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.19350628554821014, "epoch": 2.8504901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.05586272455051703, "kl": 0.06838033348321915, "learning_rate": 8.035205700685165e-09, "loss": 0.0007, "num_tokens": 73137668.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004388093948364, "sampling/importance_sampling_ratio/min": 0.14207322895526886, "sampling/sampling_logp_difference/max": 1.9514126777648926, "sampling/sampling_logp_difference/mean": 0.012857876718044281, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 212.40625, "completions/mean_terminated_length": 212.40625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.22763755917549133, "epoch": 2.8517156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 1.6426674429837307, "kl": 0.06477226316928864, "learning_rate": 7.908504839081342e-09, "loss": -0.0483, "num_tokens": 73168510.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.7704322338104248, "sampling/importance_sampling_ratio/mean": 0.9998475909233093, "sampling/importance_sampling_ratio/min": 0.6149999499320984, "sampling/sampling_logp_difference/max": 0.5712237358093262, "sampling/sampling_logp_difference/mean": 0.013438526540994644, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24785181879997253, "epoch": 2.8529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.906870275441614, "kl": 0.11384504288434982, "learning_rate": 7.7828028920377e-09, "loss": 0.026, "num_tokens": 73201878.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.7291810512542725, "sampling/importance_sampling_ratio/mean": 0.9998856782913208, "sampling/importance_sampling_ratio/min": 0.4812855124473572, "sampling/sampling_logp_difference/max": 0.7312946319580078, "sampling/sampling_logp_difference/mean": 0.01550037320703268, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.22001373767852783, "epoch": 2.8541666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.056442269345327326, "kl": 0.06984420120716095, "learning_rate": 7.658100114721344e-09, "loss": 0.0007, "num_tokens": 73230206.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5333000421524048, "sampling/importance_sampling_ratio/mean": 0.9998418092727661, "sampling/importance_sampling_ratio/min": 0.6378348469734192, "sampling/sampling_logp_difference/max": 0.449675977230072, "sampling/sampling_logp_difference/mean": 0.012991837225854397, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 233.265625, "completions/mean_terminated_length": 233.265625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2905122637748718, "epoch": 2.855392156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.9336762011066053, "kl": 0.1303703337907791, "learning_rate": 7.534396760270956e-09, "loss": 0.0308, "num_tokens": 73266783.0, "reward": 0.1875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6623878479003906, "sampling/importance_sampling_ratio/mean": 1.000367283821106, "sampling/importance_sampling_ratio/min": 0.4797719717025757, "sampling/sampling_logp_difference/max": 0.7344443798065186, "sampling/sampling_logp_difference/mean": 0.015111393295228481, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 242.015625, "completions/mean_terminated_length": 242.015625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2494461089372635, "epoch": 2.8566176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.436034974723783, "kl": 0.06503041088581085, "learning_rate": 7.411693079796499e-09, "loss": 0.0066, "num_tokens": 73299808.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996614456176758, "sampling/importance_sampling_ratio/min": 0.4776008725166321, "sampling/sampling_logp_difference/max": 0.7389798760414124, "sampling/sampling_logp_difference/mean": 0.01508298609405756, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 161.734375, "completions/mean_terminated_length": 161.734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2661849558353424, "epoch": 2.857843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.5369105222747956, "kl": 0.10893932729959488, "learning_rate": 7.289989322378731e-09, "loss": -0.0015, "num_tokens": 73328111.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5277687311172485, "sampling/importance_sampling_ratio/mean": 0.9992194175720215, "sampling/importance_sampling_ratio/min": 0.21387483179569244, "sampling/sampling_logp_difference/max": 1.5423643589019775, "sampling/sampling_logp_difference/mean": 0.017198316752910614, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 174.28125, "completions/mean_terminated_length": 174.28125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24874961376190186, "epoch": 2.8590686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.4183432578176454, "kl": 0.09126145392656326, "learning_rate": 7.169285735068531e-09, "loss": 0.0184, "num_tokens": 73359169.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.000009298324585, "sampling/importance_sampling_ratio/min": 0.5815569758415222, "sampling/sampling_logp_difference/max": 0.542046308517456, "sampling/sampling_logp_difference/mean": 0.013274673372507095, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 154.234375, "completions/mean_terminated_length": 154.234375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2339939922094345, "epoch": 2.860294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.4186821388430984, "kl": 0.10646519064903259, "learning_rate": 7.049582562886513e-09, "loss": 0.0014, "num_tokens": 73382592.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6207294464111328, "sampling/importance_sampling_ratio/mean": 1.0001579523086548, "sampling/importance_sampling_ratio/min": 0.6015142202377319, "sampling/sampling_logp_difference/max": 0.5083050727844238, "sampling/sampling_logp_difference/mean": 0.013911631889641285, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 243.546875, "completions/mean_terminated_length": 243.546875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.27443066239356995, "epoch": 2.861519607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.5596873343135422, "kl": 0.06345244497060776, "learning_rate": 6.930880048822529e-09, "loss": 0.0419, "num_tokens": 73415219.0, "reward": 0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5370264053344727, "sampling/importance_sampling_ratio/mean": 1.0004478693008423, "sampling/importance_sampling_ratio/min": 0.4976615905761719, "sampling/sampling_logp_difference/max": 0.6978349685668945, "sampling/sampling_logp_difference/mean": 0.014734284020960331, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.28680717945098877, "epoch": 2.8627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.9868483285711989, "kl": 0.15480905771255493, "learning_rate": 6.813178433835221e-09, "loss": 0.0318, "num_tokens": 73436355.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5985099077224731, "sampling/importance_sampling_ratio/mean": 1.0001389980316162, "sampling/importance_sampling_ratio/min": 0.3810408413410187, "sampling/sampling_logp_difference/max": 0.9648487567901611, "sampling/sampling_logp_difference/mean": 0.015054525807499886, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 173.515625, "completions/mean_terminated_length": 173.515625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.24999219179153442, "epoch": 2.8639705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 1.253634920109201, "kl": 0.09357550740242004, "learning_rate": 6.696477956851354e-09, "loss": 0.011, "num_tokens": 73468244.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996324777603149, "sampling/importance_sampling_ratio/min": 0.3114042282104492, "sampling/sampling_logp_difference/max": 1.166663408279419, "sampling/sampling_logp_difference/mean": 0.015080911107361317, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 143.8125, "completions/mean_terminated_length": 143.8125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.18993791937828064, "epoch": 2.8651960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 2.189872256789966, "kl": 0.07720914483070374, "learning_rate": 6.580778854765489e-09, "loss": 0.0085, "num_tokens": 73497880.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6078691482543945, "sampling/importance_sampling_ratio/mean": 0.999321460723877, "sampling/importance_sampling_ratio/min": 0.4330565929412842, "sampling/sampling_logp_difference/max": 0.8368868827819824, "sampling/sampling_logp_difference/mean": 0.013005631975829601, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 243.859375, "completions/mean_terminated_length": 243.859375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.28375160694122314, "epoch": 2.866421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.1260263439134148, "kl": 0.08156538009643555, "learning_rate": 6.4660813624395905e-09, "loss": 0.0083, "num_tokens": 73534111.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5071581602096558, "sampling/importance_sampling_ratio/mean": 0.9996317625045776, "sampling/importance_sampling_ratio/min": 0.40564998984336853, "sampling/sampling_logp_difference/max": 0.9022645950317383, "sampling/sampling_logp_difference/mean": 0.01562865637242794, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 201.3125, "completions/mean_terminated_length": 201.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2539217174053192, "epoch": 2.8676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.0983780464082489, "kl": 0.07987286150455475, "learning_rate": 6.3523857127021905e-09, "loss": 0.0074, "num_tokens": 73566035.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000715255737305, "sampling/importance_sampling_ratio/min": 0.11404351145029068, "sampling/sampling_logp_difference/max": 2.171175241470337, "sampling/sampling_logp_difference/mean": 0.014142842963337898, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 189.703125, "completions/mean_terminated_length": 189.703125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.19674170017242432, "epoch": 2.868872549019608, "frac_reward_zero_std": 0.75, "grad_norm": 1.4984189584691552, "kl": 0.07396071404218674, "learning_rate": 6.239692136348284e-09, "loss": 0.0059, "num_tokens": 73598736.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.8570950031280518, "sampling/importance_sampling_ratio/mean": 1.0001121759414673, "sampling/importance_sampling_ratio/min": 0.3313627541065216, "sampling/sampling_logp_difference/max": 1.104541540145874, "sampling/sampling_logp_difference/mean": 0.01367383636534214, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 195.09375, "completions/mean_terminated_length": 195.09375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.25682249665260315, "epoch": 2.8700980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.7423779275368756, "kl": 0.08152695000171661, "learning_rate": 6.12800086213866e-09, "loss": -0.0015, "num_tokens": 73630582.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7536267042160034, "sampling/importance_sampling_ratio/mean": 1.0003193616867065, "sampling/importance_sampling_ratio/min": 0.5676500201225281, "sampling/sampling_logp_difference/max": 0.566250205039978, "sampling/sampling_logp_difference/mean": 0.013737764209508896, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 209.28125, "completions/mean_terminated_length": 209.28125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3201642632484436, "epoch": 2.8713235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.1829241098472207, "kl": 0.08646942675113678, "learning_rate": 6.017312116799566e-09, "loss": 0.069, "num_tokens": 73661016.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6496838331222534, "sampling/importance_sampling_ratio/mean": 1.000123381614685, "sampling/importance_sampling_ratio/min": 0.5428546071052551, "sampling/sampling_logp_difference/max": 0.6109137535095215, "sampling/sampling_logp_difference/mean": 0.01596347987651825, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 139.421875, "completions/mean_terminated_length": 139.421875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.1933838427066803, "epoch": 2.872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.1150050333548559, "kl": 0.07972557842731476, "learning_rate": 5.907626125022158e-09, "loss": 0.0008, "num_tokens": 73686451.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6496334075927734, "sampling/importance_sampling_ratio/mean": 0.9993104934692383, "sampling/importance_sampling_ratio/min": 0.5260617733001709, "sampling/sampling_logp_difference/max": 0.6423366069793701, "sampling/sampling_logp_difference/mean": 0.012571008875966072, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 169.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.22068004310131073, "epoch": 2.873774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.03778412130877862, "kl": 0.05655568093061447, "learning_rate": 5.798943109461995e-09, "loss": 0.0006, "num_tokens": 73713639.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.464235544204712, "sampling/importance_sampling_ratio/mean": 0.9997299909591675, "sampling/importance_sampling_ratio/min": 0.6174396872520447, "sampling/sampling_logp_difference/max": 0.4821739196777344, "sampling/sampling_logp_difference/mean": 0.012931197881698608, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 197.65625, "completions/mean_terminated_length": 197.65625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2522512674331665, "epoch": 2.875, "frac_reward_zero_std": 0.75, "grad_norm": 1.4023857839598615, "kl": 0.09977763891220093, "learning_rate": 5.691263290738824e-09, "loss": 0.0637, "num_tokens": 73745889.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.550814151763916, "sampling/importance_sampling_ratio/mean": 1.0001001358032227, "sampling/importance_sampling_ratio/min": 0.5773372054100037, "sampling/sampling_logp_difference/max": 0.5493288040161133, "sampling/sampling_logp_difference/mean": 0.014501580968499184, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2255808264017105, "epoch": 2.876225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.3497166065174733, "kl": 0.05652454122900963, "learning_rate": 5.5845868874357385e-09, "loss": 0.002, "num_tokens": 73780145.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5456663370132446, "sampling/importance_sampling_ratio/mean": 1.0001392364501953, "sampling/importance_sampling_ratio/min": 0.40637242794036865, "sampling/sampling_logp_difference/max": 0.9004852771759033, "sampling/sampling_logp_difference/mean": 0.013296417891979218, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 257.921875, "completions/mean_terminated_length": 257.921875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2910671830177307, "epoch": 2.877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03939602125877812, "kl": 0.054026078432798386, "learning_rate": 5.4789141160991314e-09, "loss": 0.0005, "num_tokens": 73819852.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.637647032737732, "sampling/importance_sampling_ratio/mean": 1.000435709953308, "sampling/importance_sampling_ratio/min": 0.47571611404418945, "sampling/sampling_logp_difference/max": 0.742933988571167, "sampling/sampling_logp_difference/mean": 0.016137175261974335, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 169.71875, "completions/mean_terminated_length": 169.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24733196198940277, "epoch": 2.8786764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.5302278882351905, "kl": 0.14274589717388153, "learning_rate": 5.374245191238025e-09, "loss": -0.0273, "num_tokens": 73845690.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001418590545654, "sampling/importance_sampling_ratio/min": 0.6148183941841125, "sampling/sampling_logp_difference/max": 0.7016294002532959, "sampling/sampling_logp_difference/mean": 0.014793280512094498, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 204.578125, "completions/mean_terminated_length": 204.578125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.22955816984176636, "epoch": 2.8799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8460039376966723, "kl": 0.08136755228042603, "learning_rate": 5.270580325323681e-09, "loss": -0.0066, "num_tokens": 73878335.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6463840007781982, "sampling/importance_sampling_ratio/mean": 0.9999974966049194, "sampling/importance_sampling_ratio/min": 0.5290601849555969, "sampling/sampling_logp_difference/max": 0.6366531252861023, "sampling/sampling_logp_difference/mean": 0.012156964279711246, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 228.90625, "completions/mean_terminated_length": 228.90625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2115429937839508, "epoch": 2.881127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.9427606694568517, "kl": 0.06694163382053375, "learning_rate": 5.167919728789271e-09, "loss": -0.0038, "num_tokens": 73909689.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5651397705078125, "sampling/importance_sampling_ratio/mean": 0.9999818801879883, "sampling/importance_sampling_ratio/min": 0.598135769367218, "sampling/sampling_logp_difference/max": 0.5139374732971191, "sampling/sampling_logp_difference/mean": 0.01208187360316515, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 203.3125, "completions/mean_terminated_length": 203.3125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.17354030907154083, "epoch": 2.8823529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.995867825936497, "kl": 0.06583705544471741, "learning_rate": 5.0662636100292086e-09, "loss": 0.0132, "num_tokens": 73937341.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.9133031368255615, "sampling/importance_sampling_ratio/mean": 0.9998100399971008, "sampling/importance_sampling_ratio/min": 0.45969629287719727, "sampling/sampling_logp_difference/max": 0.7771892547607422, "sampling/sampling_logp_difference/mean": 0.01046935748308897, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 231.78125, "completions/mean_terminated_length": 231.78125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3074193596839905, "epoch": 2.883578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.014062199626538, "kl": 0.0723794549703598, "learning_rate": 4.965612175399092e-09, "loss": -0.0166, "num_tokens": 73976207.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.482747197151184, "sampling/importance_sampling_ratio/mean": 0.9993435740470886, "sampling/importance_sampling_ratio/min": 0.4954190254211426, "sampling/sampling_logp_difference/max": 0.7023513317108154, "sampling/sampling_logp_difference/mean": 0.016915656626224518, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 203.078125, "completions/mean_terminated_length": 203.078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.24749070405960083, "epoch": 2.8848039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.900352336707037, "kl": 0.08469018340110779, "learning_rate": 4.865965629214819e-09, "loss": 0.0537, "num_tokens": 74008564.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5668904781341553, "sampling/importance_sampling_ratio/mean": 0.9996317625045776, "sampling/importance_sampling_ratio/min": 0.5433550477027893, "sampling/sampling_logp_difference/max": 0.609992265701294, "sampling/sampling_logp_difference/mean": 0.014950131066143513, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 197.40625, "completions/mean_terminated_length": 197.40625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.21615588665008545, "epoch": 2.8860294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 1.3787796597011528, "kl": 0.07328522205352783, "learning_rate": 4.767324173752696e-09, "loss": -0.0242, "num_tokens": 74036430.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5979527235031128, "sampling/importance_sampling_ratio/mean": 0.9993993043899536, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 0.6855719089508057, "sampling/sampling_logp_difference/mean": 0.011895122937858105, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 168.34375, "completions/mean_terminated_length": 168.34375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.23322808742523193, "epoch": 2.8872549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.9803163233145722, "kl": 0.0867251604795456, "learning_rate": 4.669688009248607e-09, "loss": 0.0057, "num_tokens": 74067076.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004045963287354, "sampling/importance_sampling_ratio/min": 0.08583655208349228, "sampling/sampling_logp_difference/max": 2.455310344696045, "sampling/sampling_logp_difference/mean": 0.01472594402730465, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 191.03125, "completions/mean_terminated_length": 191.03125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.20626437664031982, "epoch": 2.888480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.0205510258812007, "kl": 0.08031073212623596, "learning_rate": 4.5730573338976786e-09, "loss": -0.0087, "num_tokens": 74094422.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000152349472046, "sampling/importance_sampling_ratio/min": 0.4937072992324829, "sampling/sampling_logp_difference/max": 1.0611766576766968, "sampling/sampling_logp_difference/mean": 0.013788919895887375, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 221.6875, "completions/mean_terminated_length": 221.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.22249484062194824, "epoch": 2.889705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 3.6222429895918276, "kl": 0.08763054013252258, "learning_rate": 4.477432343854226e-09, "loss": 0.0441, "num_tokens": 74131618.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8446756601333618, "sampling/importance_sampling_ratio/mean": 1.0002224445343018, "sampling/importance_sampling_ratio/min": 0.34300485253334045, "sampling/sampling_logp_difference/max": 1.0700106620788574, "sampling/sampling_logp_difference/mean": 0.013853983022272587, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 186.703125, "completions/mean_terminated_length": 186.703125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.18491169810295105, "epoch": 2.8909313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.256304694994609, "kl": 0.05769640952348709, "learning_rate": 4.382813233230698e-09, "loss": -0.0553, "num_tokens": 74160191.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6951230764389038, "sampling/importance_sampling_ratio/mean": 1.0000591278076172, "sampling/importance_sampling_ratio/min": 0.6097238659858704, "sampling/sampling_logp_difference/max": 0.5277553796768188, "sampling/sampling_logp_difference/mean": 0.011838407255709171, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 166.140625, "completions/mean_terminated_length": 166.140625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.20564104616641998, "epoch": 2.892156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 2.859432394324812, "kl": 0.08301499485969543, "learning_rate": 4.289200194098119e-09, "loss": 0.05, "num_tokens": 74190792.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.969706654548645, "sampling/importance_sampling_ratio/mean": 0.999295711517334, "sampling/importance_sampling_ratio/min": 0.47804099321365356, "sampling/sampling_logp_difference/max": 0.7380588054656982, "sampling/sampling_logp_difference/mean": 0.013555385172367096, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 188.640625, "completions/mean_terminated_length": 188.640625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3259560465812683, "epoch": 2.8933823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 1.2541628273128105, "kl": 0.13829918205738068, "learning_rate": 4.196593416484873e-09, "loss": -0.0037, "num_tokens": 74219441.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003265142440796, "sampling/importance_sampling_ratio/min": 0.6181949973106384, "sampling/sampling_logp_difference/max": 0.7279109954833984, "sampling/sampling_logp_difference/mean": 0.01699439249932766, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 181.453125, "completions/mean_terminated_length": 181.453125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.25785890221595764, "epoch": 2.894607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.5208642846812594, "kl": 0.08224321901798248, "learning_rate": 4.104993088376974e-09, "loss": 0.0062, "num_tokens": 74246238.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9391735792160034, "sampling/importance_sampling_ratio/mean": 1.000069499015808, "sampling/importance_sampling_ratio/min": 0.6216408014297485, "sampling/sampling_logp_difference/max": 0.662261962890625, "sampling/sampling_logp_difference/mean": 0.015074081718921661, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 198.265625, "completions/mean_terminated_length": 198.265625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2786763310432434, "epoch": 2.8958333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.678535910803952, "kl": 0.08939115703105927, "learning_rate": 4.0143993957171826e-09, "loss": -0.0006, "num_tokens": 74283679.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.9539135694503784, "sampling/importance_sampling_ratio/mean": 0.9996293783187866, "sampling/importance_sampling_ratio/min": 0.42159610986709595, "sampling/sampling_logp_difference/max": 0.8637075424194336, "sampling/sampling_logp_difference/mean": 0.016311483457684517, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 196.984375, "completions/mean_terminated_length": 196.984375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.17440006136894226, "epoch": 2.8970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.06420097834190534, "kl": 0.05318032205104828, "learning_rate": 3.924812522404952e-09, "loss": 0.0005, "num_tokens": 74318910.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6251327991485596, "sampling/importance_sampling_ratio/mean": 0.9998441934585571, "sampling/importance_sampling_ratio/min": 0.32394373416900635, "sampling/sampling_logp_difference/max": 1.1271854639053345, "sampling/sampling_logp_difference/mean": 0.012006749399006367, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 275.953125, "completions/mean_terminated_length": 275.953125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2703547477722168, "epoch": 2.8982843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 1.3364025004222901, "kl": 0.06834714859724045, "learning_rate": 3.836232650296034e-09, "loss": 0.0206, "num_tokens": 74355803.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6307568550109863, "sampling/importance_sampling_ratio/mean": 1.0003077983856201, "sampling/importance_sampling_ratio/min": 0.4889255464076996, "sampling/sampling_logp_difference/max": 0.7155450582504272, "sampling/sampling_logp_difference/mean": 0.01356726884841919, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 194.21875, "completions/mean_terminated_length": 194.21875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2951743006706238, "epoch": 2.8995098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.151388101507571, "kl": 0.13011303544044495, "learning_rate": 3.748659959201928e-09, "loss": -0.0035, "num_tokens": 74385001.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999871015548706, "sampling/importance_sampling_ratio/min": 0.5403658747673035, "sampling/sampling_logp_difference/max": 0.7130794525146484, "sampling/sampling_logp_difference/mean": 0.016106517985463142, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1530219316482544, "epoch": 2.900735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.344333949614905, "kl": 0.07376052439212799, "learning_rate": 3.6620946268896556e-09, "loss": -0.0071, "num_tokens": 74410729.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6253750324249268, "sampling/importance_sampling_ratio/mean": 0.9999873042106628, "sampling/importance_sampling_ratio/min": 0.4945722818374634, "sampling/sampling_logp_difference/max": 0.7040619850158691, "sampling/sampling_logp_difference/mean": 0.010870207101106644, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 205.03125, "completions/mean_terminated_length": 205.03125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.21062304079532623, "epoch": 2.9019607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.6114536958132395, "kl": 0.08530819416046143, "learning_rate": 3.5765368290813223e-09, "loss": -0.0212, "num_tokens": 74442875.0, "reward": 0.625, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5103265047073364, "sampling/importance_sampling_ratio/mean": 1.0002667903900146, "sampling/importance_sampling_ratio/min": 0.6178181171417236, "sampling/sampling_logp_difference/max": 0.48156118392944336, "sampling/sampling_logp_difference/mean": 0.012324854731559753, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.21767497062683105, "epoch": 2.903186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.1853996194935956, "kl": 0.07698185741901398, "learning_rate": 3.491986739453889e-09, "loss": 0.017, "num_tokens": 74474731.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.7626488208770752, "sampling/importance_sampling_ratio/mean": 1.0002050399780273, "sampling/importance_sampling_ratio/min": 0.5483866930007935, "sampling/sampling_logp_difference/max": 0.6007745265960693, "sampling/sampling_logp_difference/mean": 0.013286888599395752, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 150.3125, "completions/mean_terminated_length": 150.3125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.18176418542861938, "epoch": 2.9044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.037758449916620904, "kl": 0.05915432050824165, "learning_rate": 3.4084445296386767e-09, "loss": 0.0006, "num_tokens": 74504671.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8179765939712524, "sampling/importance_sampling_ratio/mean": 1.0005993843078613, "sampling/importance_sampling_ratio/min": 0.29257166385650635, "sampling/sampling_logp_difference/max": 1.2290456295013428, "sampling/sampling_logp_difference/mean": 0.013469929806888103, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 158.1875, "completions/mean_terminated_length": 158.1875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2425750195980072, "epoch": 2.905637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.0827567736911625, "kl": 0.08463536202907562, "learning_rate": 3.3259103692209745e-09, "loss": -0.0042, "num_tokens": 74532155.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6177663803100586, "sampling/importance_sampling_ratio/mean": 0.998638927936554, "sampling/importance_sampling_ratio/min": 0.5489455461502075, "sampling/sampling_logp_difference/max": 0.5997560024261475, "sampling/sampling_logp_difference/mean": 0.015707600861787796, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 167.765625, "completions/mean_terminated_length": 167.765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.24882417917251587, "epoch": 2.906862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.3194867085940902, "kl": 0.08847443759441376, "learning_rate": 3.2443844257400434e-09, "loss": 0.0056, "num_tokens": 74566652.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8555169105529785, "sampling/importance_sampling_ratio/mean": 1.000077247619629, "sampling/importance_sampling_ratio/min": 0.3558327257633209, "sampling/sampling_logp_difference/max": 1.0332945585250854, "sampling/sampling_logp_difference/mean": 0.015010975301265717, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 217.015625, "completions/mean_terminated_length": 217.015625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.28991562128067017, "epoch": 2.9080882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.7664279811921024, "kl": 0.11535454541444778, "learning_rate": 3.163866864688336e-09, "loss": -0.0425, "num_tokens": 74599309.0, "reward": -0.125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6979293823242188, "sampling/importance_sampling_ratio/mean": 1.0001277923583984, "sampling/importance_sampling_ratio/min": 0.5260913968086243, "sampling/sampling_logp_difference/max": 0.6422803401947021, "sampling/sampling_logp_difference/mean": 0.014657038263976574, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.26413846015930176, "epoch": 2.909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.16315822509215297, "kl": 0.0894489660859108, "learning_rate": 3.0843578495113877e-09, "loss": 0.0009, "num_tokens": 74629101.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7033652067184448, "sampling/importance_sampling_ratio/mean": 1.00035560131073, "sampling/importance_sampling_ratio/min": 0.4520955979824066, "sampling/sampling_logp_difference/max": 0.7938616275787354, "sampling/sampling_logp_difference/mean": 0.015666788443922997, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 189.234375, "completions/mean_terminated_length": 189.234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.26266130805015564, "epoch": 2.9105392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 2.2143216733494686, "kl": 0.12630251049995422, "learning_rate": 3.0058575416073707e-09, "loss": -0.0105, "num_tokens": 74657324.0, "reward": 0.34375, "reward_std": 0.7297805547714233, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5772355794906616, "sampling/importance_sampling_ratio/mean": 0.9998902082443237, "sampling/importance_sampling_ratio/min": 0.3374631702899933, "sampling/sampling_logp_difference/max": 1.086298942565918, "sampling/sampling_logp_difference/mean": 0.016290338709950447, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 143.703125, "completions/mean_terminated_length": 143.703125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.22800227999687195, "epoch": 2.911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.09451691999504142, "kl": 0.09714032709598541, "learning_rate": 2.9283661003270952e-09, "loss": 0.001, "num_tokens": 74684649.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7779202461242676, "sampling/importance_sampling_ratio/mean": 1.0003443956375122, "sampling/importance_sampling_ratio/min": 0.49009910225868225, "sampling/sampling_logp_difference/max": 0.7131476402282715, "sampling/sampling_logp_difference/mean": 0.013768637552857399, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 208.203125, "completions/mean_terminated_length": 208.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2165367305278778, "epoch": 2.9129901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.06683402199550116, "kl": 0.08643889427185059, "learning_rate": 2.851883682973233e-09, "loss": 0.0008, "num_tokens": 74718646.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.647499680519104, "sampling/importance_sampling_ratio/mean": 1.0002728700637817, "sampling/importance_sampling_ratio/min": 0.2931484580039978, "sampling/sampling_logp_difference/max": 1.2270760536193848, "sampling/sampling_logp_difference/mean": 0.012979322113096714, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 181.84375, "completions/mean_terminated_length": 181.84375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.22348137199878693, "epoch": 2.9142156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.06213794220960048, "kl": 0.060592323541641235, "learning_rate": 2.776410444800148e-09, "loss": 0.0006, "num_tokens": 74750524.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7699233293533325, "sampling/importance_sampling_ratio/mean": 0.9999657869338989, "sampling/importance_sampling_ratio/min": 0.4934474527835846, "sampling/sampling_logp_difference/max": 0.7063388824462891, "sampling/sampling_logp_difference/mean": 0.013729427009820938, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 188.640625, "completions/mean_terminated_length": 188.640625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.23032495379447937, "epoch": 2.9154411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.1745762357725924, "kl": 0.07656420767307281, "learning_rate": 2.701946539013844e-09, "loss": -0.0202, "num_tokens": 74780053.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.9472484588623047, "sampling/importance_sampling_ratio/mean": 1.0007221698760986, "sampling/importance_sampling_ratio/min": 0.5678617358207703, "sampling/sampling_logp_difference/max": 0.6664173603057861, "sampling/sampling_logp_difference/mean": 0.013936182484030724, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 142.609375, "completions/mean_terminated_length": 142.609375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2128639966249466, "epoch": 2.9166666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.4505295862517897, "kl": 0.07548115402460098, "learning_rate": 2.628492116771297e-09, "loss": 0.0028, "num_tokens": 74807724.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6647119522094727, "sampling/importance_sampling_ratio/mean": 1.0005338191986084, "sampling/importance_sampling_ratio/min": 0.6141372323036194, "sampling/sampling_logp_difference/max": 0.5096521377563477, "sampling/sampling_logp_difference/mean": 0.01334769744426012, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 194.1875, "completions/mean_terminated_length": 194.1875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3141852021217346, "epoch": 2.917892156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.8320177654093657, "kl": 0.1052158772945404, "learning_rate": 2.556047327180344e-09, "loss": -0.0205, "num_tokens": 74836168.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.623957633972168, "sampling/importance_sampling_ratio/mean": 0.9996415376663208, "sampling/importance_sampling_ratio/min": 0.3899306058883667, "sampling/sampling_logp_difference/max": 0.941786527633667, "sampling/sampling_logp_difference/mean": 0.017161313444375992, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 180.21875, "completions/mean_terminated_length": 180.21875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.256213903427124, "epoch": 2.9191176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.1278551278739337, "kl": 0.096051424741745, "learning_rate": 2.484612317299295e-09, "loss": -0.0088, "num_tokens": 74865574.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7726800441741943, "sampling/importance_sampling_ratio/mean": 0.9998409748077393, "sampling/importance_sampling_ratio/min": 0.5763106942176819, "sampling/sampling_logp_difference/max": 0.5724925994873047, "sampling/sampling_logp_difference/mean": 0.016871415078639984, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 236.421875, "completions/mean_terminated_length": 236.421875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2461642324924469, "epoch": 2.920343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.1292133916352909, "kl": 0.1166759729385376, "learning_rate": 2.4141872321367107e-09, "loss": 0.0052, "num_tokens": 74896753.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997732639312744, "sampling/importance_sampling_ratio/min": 0.4644794166088104, "sampling/sampling_logp_difference/max": 0.7668380737304688, "sampling/sampling_logp_difference/mean": 0.014024095609784126, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.22293847799301147, "epoch": 2.9215686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.07028081816171995, "kl": 0.07327209413051605, "learning_rate": 2.344772214651014e-09, "loss": 0.0007, "num_tokens": 74930109.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7980595827102661, "sampling/importance_sampling_ratio/mean": 1.000113844871521, "sampling/importance_sampling_ratio/min": 0.44718384742736816, "sampling/sampling_logp_difference/max": 0.8047854900360107, "sampling/sampling_logp_difference/mean": 0.01504216343164444, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 228.828125, "completions/mean_terminated_length": 228.828125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.18402422964572906, "epoch": 2.922794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.1222707178536586, "kl": 0.04598855972290039, "learning_rate": 2.2763674057503235e-09, "loss": -0.0076, "num_tokens": 74968994.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994775056838989, "sampling/importance_sampling_ratio/min": 0.49969446659088135, "sampling/sampling_logp_difference/max": 0.8864250183105469, "sampling/sampling_logp_difference/mean": 0.012235797941684723, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.2333841621875763, "epoch": 2.924019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.09732410717908493, "kl": 0.09015602618455887, "learning_rate": 2.20897294429212e-09, "loss": 0.001, "num_tokens": 74997018.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7521053552627563, "sampling/importance_sampling_ratio/mean": 0.9996908903121948, "sampling/importance_sampling_ratio/min": 0.65540611743927, "sampling/sampling_logp_difference/max": 0.560818076133728, "sampling/sampling_logp_difference/mean": 0.013704277575016022, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 190.3125, "completions/mean_terminated_length": 190.3125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.25898271799087524, "epoch": 2.9252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.09993229418286245, "kl": 0.08073795586824417, "learning_rate": 2.142588967082748e-09, "loss": 0.0009, "num_tokens": 75029070.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9561160802841187, "sampling/importance_sampling_ratio/mean": 1.0003554821014404, "sampling/importance_sampling_ratio/min": 0.5336609482765198, "sampling/sampling_logp_difference/max": 0.6709609031677246, "sampling/sampling_logp_difference/mean": 0.014637565240263939, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 227.71875, "completions/mean_terminated_length": 227.71875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2954796850681305, "epoch": 2.9264705882352944, "frac_reward_zero_std": 0.25, "grad_norm": 2.1971779506717732, "kl": 0.10312424600124359, "learning_rate": 2.0772156088776913e-09, "loss": -0.0036, "num_tokens": 75059932.0, "reward": 0.3125, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6147301197052002, "sampling/importance_sampling_ratio/mean": 1.000575065612793, "sampling/importance_sampling_ratio/min": 0.4161491394042969, "sampling/sampling_logp_difference/max": 0.8767116069793701, "sampling/sampling_logp_difference/mean": 0.01552021037787199, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 167.703125, "completions/mean_terminated_length": 167.703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3421614170074463, "epoch": 2.9276960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.370010336418139, "kl": 0.10211165994405746, "learning_rate": 2.0128530023804656e-09, "loss": 0.0125, "num_tokens": 75090329.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5467345714569092, "sampling/importance_sampling_ratio/mean": 1.0004173517227173, "sampling/importance_sampling_ratio/min": 0.5122334361076355, "sampling/sampling_logp_difference/max": 0.6689748764038086, "sampling/sampling_logp_difference/mean": 0.01810724101960659, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 276.734375, "completions/mean_terminated_length": 276.734375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.24409303069114685, "epoch": 2.928921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.366713240450411, "kl": 0.0857454314827919, "learning_rate": 1.9495012782433375e-09, "loss": -0.0039, "num_tokens": 75130648.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.8357044458389282, "sampling/importance_sampling_ratio/mean": 1.0000190734863281, "sampling/importance_sampling_ratio/min": 0.526208758354187, "sampling/sampling_logp_difference/max": 0.6420572996139526, "sampling/sampling_logp_difference/mean": 0.014364847913384438, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 176.765625, "completions/mean_terminated_length": 176.765625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.19778773188591003, "epoch": 2.9301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.053425809659322615, "kl": 0.08036743104457855, "learning_rate": 1.887160565066048e-09, "loss": 0.0007, "num_tokens": 75159241.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.959877610206604, "sampling/importance_sampling_ratio/mean": 1.000080943107605, "sampling/importance_sampling_ratio/min": 0.1343042552471161, "sampling/sampling_logp_difference/max": 2.0076475143432617, "sampling/sampling_logp_difference/mean": 0.013936562463641167, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 254.890625, "completions/mean_terminated_length": 254.890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.16103404760360718, "epoch": 2.931372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0610412820395003, "kl": 0.06309305131435394, "learning_rate": 1.8258309893965374e-09, "loss": 0.0006, "num_tokens": 75197602.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.755321741104126, "sampling/importance_sampling_ratio/mean": 0.9997216463088989, "sampling/importance_sampling_ratio/min": 0.6123639941215515, "sampling/sampling_logp_difference/max": 0.5626522302627563, "sampling/sampling_logp_difference/mean": 0.01104649156332016, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 155.265625, "completions/mean_terminated_length": 155.265625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2115069031715393, "epoch": 2.9325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.04934542419140079, "kl": 0.07161717116832733, "learning_rate": 1.7655126757297744e-09, "loss": 0.0007, "num_tokens": 75226099.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994068145751953, "sampling/importance_sampling_ratio/min": 0.475801557302475, "sampling/sampling_logp_difference/max": 0.8327982425689697, "sampling/sampling_logp_difference/mean": 0.013303788378834724, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 146.09375, "completions/mean_terminated_length": 146.09375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.23196184635162354, "epoch": 2.9338235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.6438645232529383, "kl": 0.0847909152507782, "learning_rate": 1.7062057465082046e-09, "loss": -0.0107, "num_tokens": 75253577.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5488466024398804, "sampling/importance_sampling_ratio/mean": 0.9996697306632996, "sampling/importance_sampling_ratio/min": 0.5592991709709167, "sampling/sampling_logp_difference/max": 0.5810707807540894, "sampling/sampling_logp_difference/mean": 0.014680958352982998, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 158.5625, "completions/mean_terminated_length": 158.5625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.24769297242164612, "epoch": 2.935049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.2540197987817803, "kl": 0.09263478219509125, "learning_rate": 1.6479103221211377e-09, "loss": 0.0056, "num_tokens": 75283101.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000531673431396, "sampling/importance_sampling_ratio/min": 0.6368657350540161, "sampling/sampling_logp_difference/max": 0.8148508071899414, "sampling/sampling_logp_difference/mean": 0.01475741621106863, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 230.015625, "completions/mean_terminated_length": 230.015625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.27384263277053833, "epoch": 2.936274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.042707395194060215, "kl": 0.07987666875123978, "learning_rate": 1.5906265209045254e-09, "loss": 0.0008, "num_tokens": 75315470.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999940991401672, "sampling/importance_sampling_ratio/min": 0.5005773305892944, "sampling/sampling_logp_difference/max": 1.0181095600128174, "sampling/sampling_logp_difference/mean": 0.0167281124740839, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 222.9375, "completions/mean_terminated_length": 222.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.19160006940364838, "epoch": 2.9375, "frac_reward_zero_std": 0.5, "grad_norm": 1.4533464685523405, "kl": 0.059094592928886414, "learning_rate": 1.534354459140963e-09, "loss": 0.0399, "num_tokens": 75343930.0, "reward": 0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6478482484817505, "sampling/importance_sampling_ratio/mean": 0.9995725154876709, "sampling/importance_sampling_ratio/min": 0.48819631338119507, "sampling/sampling_logp_difference/max": 0.7170376777648926, "sampling/sampling_logp_difference/mean": 0.010771173983812332, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1975494623184204, "epoch": 2.938725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.03824227447430867, "kl": 0.06701640039682388, "learning_rate": 1.4790942510590766e-09, "loss": 0.0007, "num_tokens": 75371606.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5930945873260498, "sampling/importance_sampling_ratio/mean": 0.9998019933700562, "sampling/importance_sampling_ratio/min": 0.6237468719482422, "sampling/sampling_logp_difference/max": 0.47201061248779297, "sampling/sampling_logp_difference/mean": 0.012317480519413948, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 152.203125, "completions/mean_terminated_length": 152.203125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.20233231782913208, "epoch": 2.939950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.3729668722016508, "kl": 0.07118867337703705, "learning_rate": 1.4248460088335801e-09, "loss": -0.0233, "num_tokens": 75397795.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006823539733887, "sampling/importance_sampling_ratio/min": 0.3063874840736389, "sampling/sampling_logp_difference/max": 1.1829047203063965, "sampling/sampling_logp_difference/mean": 0.013798831030726433, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 248.015625, "completions/mean_terminated_length": 248.015625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2584313452243805, "epoch": 2.9411764705882355, "frac_reward_zero_std": 0.25, "grad_norm": 1.697534943531724, "kl": 0.07394785434007645, "learning_rate": 1.371609842585053e-09, "loss": 0.028, "num_tokens": 75432196.0, "reward": 0.8125, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5889581441879272, "sampling/importance_sampling_ratio/mean": 1.0002371072769165, "sampling/importance_sampling_ratio/min": 0.5168574452400208, "sampling/sampling_logp_difference/max": 0.6599881649017334, "sampling/sampling_logp_difference/mean": 0.013856764882802963, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 189.234375, "completions/mean_terminated_length": 189.234375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3257656991481781, "epoch": 2.9424019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.10361650664180576, "kl": 0.12545722723007202, "learning_rate": 1.319385860379496e-09, "loss": 0.0012, "num_tokens": 75466611.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996939897537231, "sampling/importance_sampling_ratio/min": 0.4234618544578552, "sampling/sampling_logp_difference/max": 0.9860789775848389, "sampling/sampling_logp_difference/mean": 0.018942903727293015, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 207.171875, "completions/mean_terminated_length": 207.171875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2865198850631714, "epoch": 2.943627450980392, "frac_reward_zero_std": 0.25, "grad_norm": 1.9337615033533684, "kl": 0.09686718881130219, "learning_rate": 1.2681741682282754e-09, "loss": -0.0054, "num_tokens": 75494510.0, "reward": 0.5, "reward_std": 0.5651718378067017, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7766722440719604, "sampling/importance_sampling_ratio/mean": 0.9999604225158691, "sampling/importance_sampling_ratio/min": 0.6210312247276306, "sampling/sampling_logp_difference/max": 0.5747420787811279, "sampling/sampling_logp_difference/mean": 0.014227193780243397, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 229.640625, "completions/mean_terminated_length": 229.640625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.23316679894924164, "epoch": 2.9448529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.07515850973576715, "kl": 0.07214280962944031, "learning_rate": 1.217974870087901e-09, "loss": 0.0007, "num_tokens": 75526903.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6321943998336792, "sampling/importance_sampling_ratio/mean": 1.000157356262207, "sampling/importance_sampling_ratio/min": 0.5540791749954224, "sampling/sampling_logp_difference/max": 0.5904476642608643, "sampling/sampling_logp_difference/mean": 0.013380090706050396, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 171.890625, "completions/mean_terminated_length": 171.890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.33587875962257385, "epoch": 2.946078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.3763860182258667, "kl": 0.13876089453697205, "learning_rate": 1.1687880678596939e-09, "loss": 0.0202, "num_tokens": 75563424.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.539456844329834, "sampling/importance_sampling_ratio/mean": 1.0002048015594482, "sampling/importance_sampling_ratio/min": 0.524482011795044, "sampling/sampling_logp_difference/max": 0.6453441381454468, "sampling/sampling_logp_difference/mean": 0.018524974584579468, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 160.296875, "completions/mean_terminated_length": 160.296875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23508617281913757, "epoch": 2.9473039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.6674638252877323, "kl": 0.08367919921875, "learning_rate": 1.1206138613898962e-09, "loss": -0.0234, "num_tokens": 75588867.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4926220178604126, "sampling/importance_sampling_ratio/mean": 1.0001332759857178, "sampling/importance_sampling_ratio/min": 0.44747617840766907, "sampling/sampling_logp_difference/max": 0.8041319847106934, "sampling/sampling_logp_difference/mean": 0.014118922874331474, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 182.671875, "completions/mean_terminated_length": 182.671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.16823163628578186, "epoch": 2.9485294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.04092480094650376, "kl": 0.056618332862854004, "learning_rate": 1.0734523484689507e-09, "loss": 0.0005, "num_tokens": 75620318.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999382495880127, "sampling/importance_sampling_ratio/min": 0.45940688252449036, "sampling/sampling_logp_difference/max": 0.8144283294677734, "sampling/sampling_logp_difference/mean": 0.011973707936704159, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 197.796875, "completions/mean_terminated_length": 197.796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2799832224845886, "epoch": 2.9497549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.0930479009365046, "kl": 0.09062144160270691, "learning_rate": 1.0273036248318324e-09, "loss": 0.0163, "num_tokens": 75650945.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001051425933838, "sampling/importance_sampling_ratio/min": 0.6147287487983704, "sampling/sampling_logp_difference/max": 0.766869068145752, "sampling/sampling_logp_difference/mean": 0.013567798770964146, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 204.15625, "completions/mean_terminated_length": 204.15625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.25723665952682495, "epoch": 2.950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.4889937073059785, "kl": 0.09684564918279648, "learning_rate": 9.82167784157495e-10, "loss": -0.0005, "num_tokens": 75679355.0, "reward": 0.34375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.62906014919281, "sampling/importance_sampling_ratio/mean": 0.9999738335609436, "sampling/importance_sampling_ratio/min": 0.6158067584037781, "sampling/sampling_logp_difference/max": 0.4880032539367676, "sampling/sampling_logp_difference/mean": 0.014383267611265182, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 193.65625, "completions/mean_terminated_length": 193.65625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.252433717250824, "epoch": 2.952205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 2.963388265256947, "kl": 0.10595382750034332, "learning_rate": 9.380449180688143e-10, "loss": -0.0252, "num_tokens": 75708565.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998771548271179, "sampling/importance_sampling_ratio/min": 0.36926916241645813, "sampling/sampling_logp_difference/max": 0.9962295293807983, "sampling/sampling_logp_difference/mean": 0.01537374872714281, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 241.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.21027690172195435, "epoch": 2.9534313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.03188025150667092, "kl": 0.04365028068423271, "learning_rate": 8.949351161324225e-10, "loss": 0.0004, "num_tokens": 75745885.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998771548271179, "sampling/importance_sampling_ratio/min": 0.4754011332988739, "sampling/sampling_logp_difference/max": 0.7435963153839111, "sampling/sampling_logp_difference/mean": 0.012878429144620895, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 144.09375, "completions/mean_terminated_length": 144.09375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.191091388463974, "epoch": 2.954656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.3793387235702022, "kl": 0.12095391750335693, "learning_rate": 8.528384658584853e-10, "loss": 0.0054, "num_tokens": 75769843.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9066637754440308, "sampling/importance_sampling_ratio/mean": 0.9999489784240723, "sampling/importance_sampling_ratio/min": 0.4309251308441162, "sampling/sampling_logp_difference/max": 0.8418209552764893, "sampling/sampling_logp_difference/mean": 0.012651410885155201, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 218.53125, "completions/mean_terminated_length": 218.53125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.23037388920783997, "epoch": 2.9558823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.9820654691587366, "kl": 0.05584049969911575, "learning_rate": 8.117550527005912e-10, "loss": 0.0105, "num_tokens": 75799893.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000017762184143, "sampling/importance_sampling_ratio/min": 0.626641035079956, "sampling/sampling_logp_difference/max": 0.7562308311462402, "sampling/sampling_logp_difference/mean": 0.012850666418671608, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 145.515625, "completions/mean_terminated_length": 145.515625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.272765189409256, "epoch": 2.957107843137255, "frac_reward_zero_std": 0.5, "grad_norm": 2.2109855999852184, "kl": 0.11711627244949341, "learning_rate": 7.716849600554188e-10, "loss": 0.013, "num_tokens": 75826294.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.9099805355072021, "sampling/importance_sampling_ratio/mean": 0.9996508359909058, "sampling/importance_sampling_ratio/min": 0.544747531414032, "sampling/sampling_logp_difference/max": 0.6470930576324463, "sampling/sampling_logp_difference/mean": 0.016956236213445663, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 183.078125, "completions/mean_terminated_length": 183.078125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.20906969904899597, "epoch": 2.9583333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.3978070372573852, "kl": 0.08074396848678589, "learning_rate": 7.326282692626806e-10, "loss": 0.0072, "num_tokens": 75852347.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5071555376052856, "sampling/importance_sampling_ratio/mean": 0.999751091003418, "sampling/importance_sampling_ratio/min": 0.5765681266784668, "sampling/sampling_logp_difference/max": 0.5506618022918701, "sampling/sampling_logp_difference/mean": 0.012953568249940872, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 246.515625, "completions/mean_terminated_length": 246.515625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31527090072631836, "epoch": 2.9595588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.638447493144575, "kl": 0.0897536426782608, "learning_rate": 6.945850596050684e-10, "loss": 0.0045, "num_tokens": 75884444.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.650807499885559, "sampling/importance_sampling_ratio/mean": 0.9997353553771973, "sampling/importance_sampling_ratio/min": 0.5791006684303284, "sampling/sampling_logp_difference/max": 0.5462789535522461, "sampling/sampling_logp_difference/mean": 0.015875663608312607, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 161.40625, "completions/mean_terminated_length": 161.40625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2676832973957062, "epoch": 2.9607843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 1.7306499516456963, "kl": 0.08243481814861298, "learning_rate": 6.575554083078083e-10, "loss": 0.0248, "num_tokens": 75911254.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.697697401046753, "sampling/importance_sampling_ratio/mean": 0.9998440146446228, "sampling/importance_sampling_ratio/min": 0.6348111629486084, "sampling/sampling_logp_difference/max": 0.5292727947235107, "sampling/sampling_logp_difference/mean": 0.015132778324186802, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 182.859375, "completions/mean_terminated_length": 182.859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1757541000843048, "epoch": 2.9620098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 2.387450335820891, "kl": 0.0584825724363327, "learning_rate": 6.215393905388278e-10, "loss": 0.2234, "num_tokens": 75939133.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6519036293029785, "sampling/importance_sampling_ratio/mean": 1.0005888938903809, "sampling/importance_sampling_ratio/min": 0.5140793323516846, "sampling/sampling_logp_difference/max": 0.6653776168823242, "sampling/sampling_logp_difference/mean": 0.011330155655741692, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 186.46875, "completions/mean_terminated_length": 186.46875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.2033308893442154, "epoch": 2.963235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.6699127491801236, "kl": 0.07381125539541245, "learning_rate": 5.865370794082558e-10, "loss": -0.001, "num_tokens": 75969115.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.9995757937431335, "sampling/importance_sampling_ratio/min": 0.6202810406684875, "sampling/sampling_logp_difference/max": 0.4775826930999756, "sampling/sampling_logp_difference/mean": 0.012664059177041054, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 178.640625, "completions/mean_terminated_length": 178.640625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2655785381793976, "epoch": 2.9644607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.9247218081431783, "kl": 0.1148318350315094, "learning_rate": 5.525485459687007e-10, "loss": 0.0007, "num_tokens": 75997972.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999703049659729, "sampling/importance_sampling_ratio/min": 0.5523868203163147, "sampling/sampling_logp_difference/max": 0.8148813247680664, "sampling/sampling_logp_difference/mean": 0.014968165196478367, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 198.1875, "completions/mean_terminated_length": 198.1875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2387477457523346, "epoch": 2.965686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.9179642676561003, "kl": 0.0756097286939621, "learning_rate": 5.195738592145838e-10, "loss": -0.0054, "num_tokens": 76040528.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6207278966903687, "sampling/importance_sampling_ratio/mean": 0.9994602799415588, "sampling/importance_sampling_ratio/min": 0.6100690960884094, "sampling/sampling_logp_difference/max": 0.4941830635070801, "sampling/sampling_logp_difference/mean": 0.015103841200470924, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 229.015625, "completions/mean_terminated_length": 229.015625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2549667954444885, "epoch": 2.9669117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.03421188773261006, "kl": 0.05629030987620354, "learning_rate": 4.876130860825278e-10, "loss": 0.0006, "num_tokens": 76076129.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6196949481964111, "sampling/importance_sampling_ratio/mean": 1.0002193450927734, "sampling/importance_sampling_ratio/min": 0.5888904333114624, "sampling/sampling_logp_difference/max": 0.5295150876045227, "sampling/sampling_logp_difference/mean": 0.01382712833583355, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 177.78125, "completions/mean_terminated_length": 177.78125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.19116365909576416, "epoch": 2.968137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.048813274652691925, "kl": 0.04953814297914505, "learning_rate": 4.566662914508579e-10, "loss": 0.0005, "num_tokens": 76104707.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9391807317733765, "sampling/importance_sampling_ratio/mean": 0.9995027184486389, "sampling/importance_sampling_ratio/min": 0.47432684898376465, "sampling/sampling_logp_difference/max": 0.7458586692810059, "sampling/sampling_logp_difference/mean": 0.012865998782217503, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 244.421875, "completions/mean_terminated_length": 244.421875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.20995011925697327, "epoch": 2.969362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.5809898301512049, "kl": 0.05042857304215431, "learning_rate": 4.267335381396564e-10, "loss": 0.0375, "num_tokens": 76144974.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5133386850357056, "sampling/importance_sampling_ratio/mean": 1.0000345706939697, "sampling/importance_sampling_ratio/min": 0.47389116883277893, "sampling/sampling_logp_difference/max": 0.7467775344848633, "sampling/sampling_logp_difference/mean": 0.010932646691799164, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 206.65625, "completions/mean_terminated_length": 206.65625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.26046937704086304, "epoch": 2.9705882352941178, "frac_reward_zero_std": 0.25, "grad_norm": 2.0295393323799478, "kl": 0.1752389371395111, "learning_rate": 3.978148869103748e-10, "loss": -0.0007, "num_tokens": 76176808.0, "reward": 0.71875, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7038747072219849, "sampling/importance_sampling_ratio/mean": 1.00004243850708, "sampling/importance_sampling_ratio/min": 0.5678154826164246, "sampling/sampling_logp_difference/max": 0.5659587383270264, "sampling/sampling_logp_difference/mean": 0.014540375210344791, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 189.234375, "completions/mean_terminated_length": 189.234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2247229814529419, "epoch": 2.971813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.03381290248002876, "kl": 0.056775644421577454, "learning_rate": 3.699103964661665e-10, "loss": 0.0005, "num_tokens": 76220871.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8061577081680298, "sampling/importance_sampling_ratio/mean": 0.9999212622642517, "sampling/importance_sampling_ratio/min": 0.5134853720664978, "sampling/sampling_logp_difference/max": 0.6665337085723877, "sampling/sampling_logp_difference/mean": 0.016165809705853462, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 159.546875, "completions/mean_terminated_length": 159.546875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.24071826040744781, "epoch": 2.9730392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.4308604112635903, "kl": 0.10873550176620483, "learning_rate": 3.430201234513874e-10, "loss": 0.0058, "num_tokens": 76244666.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998641014099121, "sampling/importance_sampling_ratio/min": 0.5392753481864929, "sampling/sampling_logp_difference/max": 0.945838212966919, "sampling/sampling_logp_difference/mean": 0.015373199246823788, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 187.1875, "completions/mean_terminated_length": 187.1875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2494470477104187, "epoch": 2.974264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.045471425782333735, "kl": 0.08607512712478638, "learning_rate": 3.171441224514848e-10, "loss": 0.0008, "num_tokens": 76275046.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5827796459197998, "sampling/importance_sampling_ratio/mean": 0.9993318915367126, "sampling/importance_sampling_ratio/min": 0.5341923236846924, "sampling/sampling_logp_difference/max": 0.6269993782043457, "sampling/sampling_logp_difference/mean": 0.014485558494925499, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 179.765625, "completions/mean_terminated_length": 179.765625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2240767478942871, "epoch": 2.9754901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.5617583684394614, "kl": 0.10678014159202576, "learning_rate": 2.922824459931639e-10, "loss": 0.057, "num_tokens": 76307127.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999181032180786, "sampling/importance_sampling_ratio/min": 0.5355904698371887, "sampling/sampling_logp_difference/max": 0.9863035678863525, "sampling/sampling_logp_difference/mean": 0.013244271278381348, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 198.609375, "completions/mean_terminated_length": 198.609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.29143062233924866, "epoch": 2.9767156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.470008467109167, "kl": 0.08310844004154205, "learning_rate": 2.684351445440547e-10, "loss": -0.043, "num_tokens": 76342798.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6291046142578125, "sampling/importance_sampling_ratio/mean": 1.000214695930481, "sampling/importance_sampling_ratio/min": 0.5359659194946289, "sampling/sampling_logp_difference/max": 0.6236846446990967, "sampling/sampling_logp_difference/mean": 0.017195921391248703, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 190.140625, "completions/mean_terminated_length": 190.140625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.29836857318878174, "epoch": 2.9779411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.3373957085876917, "kl": 0.13529150187969208, "learning_rate": 2.456022665127122e-10, "loss": -0.0168, "num_tokens": 76379095.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000542402267456, "sampling/importance_sampling_ratio/min": 0.5902017951011658, "sampling/sampling_logp_difference/max": 0.7390432357788086, "sampling/sampling_logp_difference/mean": 0.015541539527475834, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 150.703125, "completions/mean_terminated_length": 150.703125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2485143095254898, "epoch": 2.9791666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.6228619743834114, "kl": 0.1274341642856598, "learning_rate": 2.2378385824833866e-10, "loss": -0.0128, "num_tokens": 76409188.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.73202645778656, "sampling/importance_sampling_ratio/mean": 1.000281572341919, "sampling/importance_sampling_ratio/min": 0.48663073778152466, "sampling/sampling_logp_difference/max": 0.7202496528625488, "sampling/sampling_logp_difference/mean": 0.015785079449415207, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.26056230068206787, "epoch": 2.980392156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.8957602438814687, "kl": 0.0845237746834755, "learning_rate": 2.0297996404095018e-10, "loss": -0.113, "num_tokens": 76439876.0, "reward": 0.3125, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4784256219863892, "sampling/importance_sampling_ratio/mean": 0.999798059463501, "sampling/importance_sampling_ratio/min": 0.3641431927680969, "sampling/sampling_logp_difference/max": 1.0102081298828125, "sampling/sampling_logp_difference/mean": 0.014546210877597332, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 173.078125, "completions/mean_terminated_length": 173.078125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.24924761056900024, "epoch": 2.9816176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.4017548869277197, "kl": 0.08277080953121185, "learning_rate": 1.8319062612115467e-10, "loss": -0.0112, "num_tokens": 76468889.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6413885354995728, "sampling/importance_sampling_ratio/mean": 1.0004193782806396, "sampling/importance_sampling_ratio/min": 0.5419471859931946, "sampling/sampling_logp_difference/max": 0.6125867366790771, "sampling/sampling_logp_difference/mean": 0.015157599002122879, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 179.828125, "completions/mean_terminated_length": 179.828125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2005600929260254, "epoch": 2.982843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.2288829197558995, "kl": 0.056043677031993866, "learning_rate": 1.6441588466009627e-10, "loss": 0.0059, "num_tokens": 76496974.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5744365453720093, "sampling/importance_sampling_ratio/mean": 0.9994364976882935, "sampling/importance_sampling_ratio/min": 0.6059094667434692, "sampling/sampling_logp_difference/max": 0.5010247230529785, "sampling/sampling_logp_difference/mean": 0.011820659041404724, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 169.203125, "completions/mean_terminated_length": 169.203125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.242442786693573, "epoch": 2.9840686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.4555295883616384, "kl": 0.08681453764438629, "learning_rate": 1.4665577776923343e-10, "loss": 0.0132, "num_tokens": 76527131.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5895564556121826, "sampling/importance_sampling_ratio/mean": 0.9990965127944946, "sampling/importance_sampling_ratio/min": 0.35135895013809204, "sampling/sampling_logp_difference/max": 1.0459469556808472, "sampling/sampling_logp_difference/mean": 0.01459289900958538, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 175.890625, "completions/mean_terminated_length": 175.890625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2646161615848541, "epoch": 2.985294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.446731364139672, "kl": 0.09826575964689255, "learning_rate": 1.2991034150050538e-10, "loss": -0.0477, "num_tokens": 76554276.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.8623839616775513, "sampling/importance_sampling_ratio/mean": 1.0008070468902588, "sampling/importance_sampling_ratio/min": 0.5893176198005676, "sampling/sampling_logp_difference/max": 0.6218574047088623, "sampling/sampling_logp_difference/mean": 0.013790035620331764, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 166.890625, "completions/mean_terminated_length": 166.890625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.21010886132717133, "epoch": 2.986519607843137, "frac_reward_zero_std": 0.5, "grad_norm": 2.1361755153214474, "kl": 0.09595704823732376, "learning_rate": 1.1417960984605457e-10, "loss": 0.0223, "num_tokens": 76580621.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992457032203674, "sampling/importance_sampling_ratio/min": 0.5073873996734619, "sampling/sampling_logp_difference/max": 0.7147841453552246, "sampling/sampling_logp_difference/mean": 0.012746874243021011, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 229.0625, "completions/mean_terminated_length": 229.0625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.23335005342960358, "epoch": 2.9877450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.6997078272872883, "kl": 0.06975404918193817, "learning_rate": 9.946361473822662e-11, "loss": 0.0268, "num_tokens": 76614641.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.9622851610183716, "sampling/importance_sampling_ratio/mean": 1.0005013942718506, "sampling/importance_sampling_ratio/min": 0.06372610479593277, "sampling/sampling_logp_difference/max": 2.7531609535217285, "sampling/sampling_logp_difference/mean": 0.013777503743767738, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2805878520011902, "epoch": 2.9889705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 1.3875109591325976, "kl": 0.08423937112092972, "learning_rate": 8.576238604968144e-11, "loss": 0.0179, "num_tokens": 76648129.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998219013214111, "sampling/importance_sampling_ratio/min": 0.5426336526870728, "sampling/sampling_logp_difference/max": 0.7030305862426758, "sampling/sampling_logp_difference/mean": 0.015688970685005188, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 151.140625, "completions/mean_terminated_length": 151.140625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2604881227016449, "epoch": 2.9901960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.2521160841775165, "kl": 0.11699055880308151, "learning_rate": 7.307595159300461e-11, "loss": -0.0004, "num_tokens": 76677706.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.544237494468689, "sampling/importance_sampling_ratio/mean": 0.9995912313461304, "sampling/importance_sampling_ratio/min": 0.6103914976119995, "sampling/sampling_logp_difference/max": 0.493654727935791, "sampling/sampling_logp_difference/mean": 0.01679575815796852, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3009791970252991, "epoch": 2.991421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.7276816385245755, "kl": 0.1001376211643219, "learning_rate": 6.140433712076287e-11, "loss": -0.0574, "num_tokens": 76712930.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998260140419006, "sampling/importance_sampling_ratio/min": 0.3920634984970093, "sampling/sampling_logp_difference/max": 0.9363315105438232, "sampling/sampling_logp_difference/mean": 0.01611517369747162, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2741738557815552, "epoch": 2.9926470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.4004598692258408, "kl": 0.09876975417137146, "learning_rate": 5.074756632572619e-11, "loss": 0.0142, "num_tokens": 76742546.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.871956467628479, "sampling/importance_sampling_ratio/mean": 1.0001269578933716, "sampling/importance_sampling_ratio/min": 0.5572464466094971, "sampling/sampling_logp_difference/max": 0.6269841194152832, "sampling/sampling_logp_difference/mean": 0.01623227261006832, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 190.6875, "completions/mean_terminated_length": 190.6875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2576693892478943, "epoch": 2.993872549019608, "frac_reward_zero_std": 0.5, "grad_norm": 2.079763238699461, "kl": 0.07150693237781525, "learning_rate": 4.110566084036815e-11, "loss": 0.0212, "num_tokens": 76771534.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4220448732376099, "sampling/importance_sampling_ratio/mean": 0.9988895058631897, "sampling/importance_sampling_ratio/min": 0.6046943068504333, "sampling/sampling_logp_difference/max": 0.5030322074890137, "sampling/sampling_logp_difference/mean": 0.014783736318349838, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 211.5625, "completions/mean_terminated_length": 211.5625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.30968382954597473, "epoch": 2.9950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.637648467447538, "kl": 0.10707981139421463, "learning_rate": 3.247864023719904e-11, "loss": -0.0073, "num_tokens": 76800914.0, "reward": 0.5, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000529289245605, "sampling/importance_sampling_ratio/min": 0.4752058982849121, "sampling/sampling_logp_difference/max": 1.4294970035552979, "sampling/sampling_logp_difference/mean": 0.015981096774339676, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 150.59375, "completions/mean_terminated_length": 150.59375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.19654402136802673, "epoch": 2.9963235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.3421147106329778, "kl": 0.10663546621799469, "learning_rate": 2.4866522028488268e-11, "loss": 0.0004, "num_tokens": 76827384.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6062535047531128, "sampling/importance_sampling_ratio/mean": 0.9999834895133972, "sampling/importance_sampling_ratio/min": 0.47787678241729736, "sampling/sampling_logp_difference/max": 0.7384023666381836, "sampling/sampling_logp_difference/mean": 0.012441320344805717, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 186.046875, "completions/mean_terminated_length": 186.046875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.26118266582489014, "epoch": 2.997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.3243641853561114, "kl": 0.11284665763378143, "learning_rate": 1.8269321666375403e-11, "loss": -0.0286, "num_tokens": 76856539.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6207815408706665, "sampling/importance_sampling_ratio/mean": 0.9997130036354065, "sampling/importance_sampling_ratio/min": 0.3909359574317932, "sampling/sampling_logp_difference/max": 0.9392115473747253, "sampling/sampling_logp_difference/mean": 0.016125911846756935, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 214.484375, "completions/mean_terminated_length": 214.484375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24739396572113037, "epoch": 2.998774509803922, "frac_reward_zero_std": 0.5, "grad_norm": 1.6065085595715245, "kl": 0.09260021150112152, "learning_rate": 1.2687052542759147e-11, "loss": 0.0473, "num_tokens": 76889114.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6634241342544556, "sampling/importance_sampling_ratio/mean": 1.0005452632904053, "sampling/importance_sampling_ratio/min": 0.5677346587181091, "sampling/sampling_logp_difference/max": 0.56610107421875, "sampling/sampling_logp_difference/mean": 0.014586443081498146, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 195.234375, "completions/mean_terminated_length": 195.234375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2355600744485855, "epoch": 3.0, "frac_reward_zero_std": 0.75, "grad_norm": 1.2193175282367605, "kl": 0.07594628632068634, "learning_rate": 8.119725989241822e-12, "loss": 0.0215, "num_tokens": 76916233.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5596669912338257, "sampling/importance_sampling_ratio/mean": 0.9998251795768738, "sampling/importance_sampling_ratio/min": 0.5702371001243591, "sampling/sampling_logp_difference/max": 0.561703085899353, "sampling/sampling_logp_difference/mean": 0.013978696428239346, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 215.984375, "completions/mean_terminated_length": 215.984375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2412678301334381, "epoch": 3.0012254901960786, "frac_reward_zero_std": 0.5, "grad_norm": 1.882924046267253, "kl": 0.12087821215391159, "learning_rate": 4.5673512772959055e-12, "loss": 0.051, "num_tokens": 76951288.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992916584014893, "sampling/importance_sampling_ratio/min": 0.32853004336357117, "sampling/sampling_logp_difference/max": 1.1131269931793213, "sampling/sampling_logp_difference/mean": 0.014975108206272125, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 152.46875, "completions/mean_terminated_length": 152.46875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.2571891248226166, "epoch": 3.002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.7295956536656483, "kl": 0.14601531624794006, "learning_rate": 2.0299356179309666e-12, "loss": 0.0302, "num_tokens": 76984662.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5563182830810547, "sampling/importance_sampling_ratio/mean": 0.9998385906219482, "sampling/importance_sampling_ratio/min": 0.6069795489311218, "sampling/sampling_logp_difference/max": 0.4992602467536926, "sampling/sampling_logp_difference/mean": 0.015918415039777756, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 163.515625, "completions/mean_terminated_length": 163.515625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.21576866507530212, "epoch": 3.0036764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.07391588468035754, "kl": 0.06929220259189606, "learning_rate": 5.074841620267278e-13, "loss": 0.0007, "num_tokens": 77010407.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5475221872329712, "sampling/importance_sampling_ratio/mean": 0.9999107122421265, "sampling/importance_sampling_ratio/min": 0.22164645791053772, "sampling/sampling_logp_difference/max": 1.506671667098999, "sampling/sampling_logp_difference/mean": 0.01383669301867485, "step": 2451 } ], "logging_steps": 1, "max_steps": 2451, "num_input_tokens_seen": 77010407, "num_train_epochs": 4, "save_steps": 817, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }